Project

General

Profile

1
package eu.dnetlib.index.utils;
2

    
3
import java.util.function.UnaryOperator;
import java.util.regex.Pattern;
4

    
5
/**
 * Removes unwanted highlight markers ({@code [hl]} and {@code [/hl]}) from a document.
 *
 * <p>The document is split at the first {@code </header>}: every marker is stripped from the
 * header part, while in the body only the markers that ended up inside markup (between
 * {@code <} and {@code >}) are stripped, so highlights placed in text content are preserved.
 *
 * @author claudio
 */
public class HighlightUtils implements UnaryOperator<String> {

	public static final String DEFAULT_HL_PRE = "[hl]";

	public static final String DEFAULT_HL_POST = "[/hl]";

	/** Literal that terminates the header part of a document. */
	private static final String HEADER_END = "</header>";

	/** Matches any opening or closing highlight marker. */
	private static final Pattern HL_MARKER = Pattern.compile("\\[/?hl\\]");

	/** Matches a markup tag containing an opening marker; groups 1 and 2 are the text around it. */
	private static final Pattern CLEAN_REGEX_OPEN = Pattern.compile("<([^>]*)\\[hl\\]([^>]*)>");

	/** Matches a markup tag containing a closing marker; groups 1 and 2 are the text around it. */
	private static final Pattern CLEAN_REGEX_CLOSE = Pattern.compile("<([^>]*)\\[\\/hl\\]([^>]*)>");

	/**
	 * Strips the highlight markers found inside markup tags of the given body.
	 *
	 * <p>Each pass removes at most one opening and one closing marker per tag, so the passes are
	 * repeated until the text stops changing.
	 *
	 * @param body the text to clean
	 * @return the body with no highlight marker left inside a tag
	 */
	private String cleanBody(String body) {
		String current = body;
		while (true) {
			// Same order as before: opening markers first, then closing markers.
			final String withoutOpen = CLEAN_REGEX_OPEN.matcher(current).replaceAll("<$1$2>");
			final String cleaned = CLEAN_REGEX_CLOSE.matcher(withoutOpen).replaceAll("<$1$2>");
			if (cleaned.equals(current)) {
				return cleaned;
			}
			current = cleaned;
		}
	}

	/**
	 * Removes the extra highlight markers from the given document.
	 *
	 * <p>If the document contains no {@code </header>}, the whole document is treated as body.
	 * Only the first {@code </header>} is used as separator; anything after it, including further
	 * occurrences of {@code </header>}, belongs to the body and is kept.
	 *
	 * @param doc the document, may be {@code null}
	 * @return the cleaned document, or {@code null} if {@code doc} is {@code null}
	 */
	@Override
	public String apply(final String doc) {
		if (doc == null) {
			return null;
		}

		final int headerEnd = doc.indexOf(HEADER_END);
		if (headerEnd < 0) {
			// No header terminator: there is no header part to strip.
			return cleanBody(doc);
		}

		final String header = doc.substring(0, headerEnd);
		final String body = doc.substring(headerEnd + HEADER_END.length());
		return HL_MARKER.matcher(header).replaceAll("") + HEADER_END + cleanBody(body);
	}
}
(1-1/3)