package eu.dnetlib.data.information.oai.publisher;

import java.text.Normalizer;

import org.apache.commons.lang3.StringEscapeUtils;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;

public class PublisherMiscTest {
|
12
|
|
13
|
@Test
|
14
|
public void test() {
|
15
|
String id = "NonavCreation.filmportal.de/DIF_NonAVCreation_EUROPA_TM & © Aardman Animations, LTD";
|
16
|
String newId = StringEscapeUtils.escapeXml11(id);
|
17
|
assertEquals("NonavCreation.filmportal.de/DIF_NonAVCreation_EUROPA_TM & © Aardman Animations, LTD", newId);
|
18
|
assertFalse(id.equals(newId));
|
19
|
}
|
20
|
|
21
|
@Test
|
22
|
public void test2() {
|
23
|
// Hochschulschriftenserver - Universität Frankfurt am Main
|
24
|
String s = "Publikationenserver der Georg-August-Universität Göttingen";
|
25
|
System.out.println("String to normalize: " + s);
|
26
|
s = StringEscapeUtils.unescapeXml(s);
|
27
|
System.out.println("unescaped: " + s);
|
28
|
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
29
|
System.out.println("normalized: " + s);
|
30
|
// remove tilde, dots... over letters
|
31
|
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^-_]]", "");
|
32
|
// change punctuation into an underscore
|
33
|
s = s.replaceAll("[\\p{Punct}&&[^-_]]", "_");
|
34
|
// remove all non-word charcheters
|
35
|
s = s.replaceAll("[\\W&&[^-_]]", "");
|
36
|
System.out.println("Converted setSpec to: " + s);
|
37
|
}
|
38
|
|
39
|
}
|