1
|
package eu.dnetlib.index.utils;
|
2
|
|
3
|
import java.util.function.UnaryOperator;
import java.util.regex.Pattern;
|
4
|
|
5
|
/**
 * Removes extra highlight markers ({@code [hl]} and {@code [/hl]}) from a document.
 *
 * <p>The document is divided at the first {@code </header>} terminator:
 * <ul>
 *   <li>in the part before the terminator, every highlight marker is removed;</li>
 *   <li>in the part after it, only the markers that occur inside a markup tag
 *       (between a {@code <} and the next {@code >}) are removed, while markers
 *       in text content are kept.</li>
 * </ul>
 * When the document contains no {@code </header>} terminator, the whole document
 * is treated as body.
 *
 * @author claudio
 */
public class HighlightUtils implements UnaryOperator<String> {

    public final static String DEFAULT_HL_PRE = "[hl]";

    public final static String DEFAULT_HL_POST = "[/hl]";

    /** Terminator separating the header part of the document from its body. */
    private static final String HEADER_END = "</header>";

    /** Matches any opening or closing highlight marker; used to clean the header part. */
    private static final Pattern HL_TAG = Pattern.compile("\\[/?hl\\]");

    /** Matches a markup tag containing an opening highlight marker. */
    private static final Pattern CLEAN_REGEX_OPEN = Pattern.compile("<([^>]*)\\[hl\\]([^>]*)>");

    /** Matches a markup tag containing a closing highlight marker. */
    private static final Pattern CLEAN_REGEX_CLOSE = Pattern.compile("<([^>]*)\\[\\/hl\\]([^>]*)>");

    /**
     * Strips the highlight markers found inside markup tags of the given body.
     *
     * <p>Each pass removes at most one opening and one closing marker per tag
     * (the greedy first group makes the regex pick the last marker in the tag),
     * so passes are repeated until the text no longer changes.
     *
     * @param body the body part of the document
     * @return the body without highlight markers inside markup tags
     */
    // TODO: implement a faster way to do this (every pass rescans the whole body)
    private String cleanBody(String body) {
        String current = body;
        // Iterative rather than recursive: the number of passes grows with the number
        // of markers in a single tag and must not be bounded by the stack depth.
        while (true) {
            final String withoutOpen = CLEAN_REGEX_OPEN.matcher(current).replaceAll("<$1$2>");
            final String cleaned = CLEAN_REGEX_CLOSE.matcher(withoutOpen).replaceAll("<$1$2>");
            if (cleaned.equals(current)) {
                return cleaned;
            }
            current = cleaned;
        }
    }

    /**
     * Cleans the given document from the extra highlight markers.
     *
     * @param doc the document
     * @return the cleaned document
     * @throws NullPointerException if {@code doc} is {@code null}
     */
    @Override
    public String apply(final String doc) {
        // Locate the terminator explicitly instead of using String.split: split drops
        // trailing empty strings and yields more than two chunks when the terminator
        // is repeated, which made the original fail or lose content.
        final int headerEnd = doc.indexOf(HEADER_END);
        if (headerEnd < 0) {
            // No header section: treat the whole document as body.
            return cleanBody(doc);
        }
        final String header = doc.substring(0, headerEnd);
        final String body = doc.substring(headerEnd + HEADER_END.length());
        return HL_TAG.matcher(header).replaceAll("") + HEADER_END + cleanBody(body);
    }
}
|