1
|
/*
|
2
|
* This file is part of CoAnSys project.
|
3
|
* Copyright (c) 2012-2015 ICM-UW
|
4
|
*
|
5
|
* CoAnSys is free software: you can redistribute it and/or modify
|
6
|
* it under the terms of the GNU Affero General Public License as published by
|
7
|
* the Free Software Foundation, either version 3 of the License, or
|
8
|
* (at your option) any later version.
|
9
|
|
10
|
* CoAnSys is distributed in the hope that it will be useful,
|
11
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
* GNU Affero General Public License for more details.
|
14
|
*
|
15
|
* You should have received a copy of the GNU Affero General Public License
|
16
|
* along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
|
17
|
*/
|
18
|
package eu.dnetlib.dhp.common.string;
|
19
|
|
20
|
import java.io.Serializable;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.ImmutableList;
|
26
|
|
27
|
/**
|
28
|
* An implementation of {@link StringNormalizer} that normalizes strings for non-strict comparisons
|
29
|
* in which one does not care about characters other than letters and digits or about differently written diacritics.
|
30
|
*
|
31
|
* @author Łukasz Dumiszewski
|
32
|
*
|
33
|
*/
|
34
|
public final class LenientComparisonStringNormalizer implements StringNormalizer, Serializable {
|
35
|
|
36
|
|
37
|
private static final long serialVersionUID = 1L;
|
38
|
|
39
|
|
40
|
private List<Character> whitelistCharacters;
|
41
|
|
42
|
|
43
|
//------------------------ CONSTRUCTORS --------------------------
|
44
|
|
45
|
public LenientComparisonStringNormalizer() {
|
46
|
this(ImmutableList.of());
|
47
|
}
|
48
|
|
49
|
/**
|
50
|
* @param whitelistCharacters - non alphanumeric characters that will not be removed
|
51
|
* during normalization
|
52
|
*/
|
53
|
public LenientComparisonStringNormalizer(List<Character> whitelistCharacters) {
|
54
|
this.whitelistCharacters = whitelistCharacters;
|
55
|
}
|
56
|
|
57
|
|
58
|
//------------------------ LOGIC --------------------------
|
59
|
|
60
|
|
61
|
|
62
|
/**
|
63
|
* Normalizes the given value. <br/>
|
64
|
* The normalized strings are better suited for non-strict comparisons, in which one does NOT care about characters that are
|
65
|
* neither letters nor digits; about accidental spaces or different diacritics etc. <br/><br/>
|
66
|
* This method:
|
67
|
* <ul>
|
68
|
* <li>Replaces all characters that are not letters or digits with spaces (except those on whitelist characters list)</li>
|
69
|
* <li>Replaces white spaces with spaces </li>
|
70
|
* <li>Trims</li>
|
71
|
* <li>Compacts multi-space gaps to one-space gaps</li>
|
72
|
* <li>Removes diacritics</li>
|
73
|
* <li>Changes characters to lower case</li>
|
74
|
* </ul>
|
75
|
* Returns "" if the passed value is null or blank
|
76
|
*
|
77
|
* @param value the string to normalize
|
78
|
* @see DiacriticsRemover#removeDiacritics(String, boolean)
|
79
|
*
|
80
|
*
|
81
|
*/
|
82
|
public String normalize(String value) {
|
83
|
|
84
|
if (StringUtils.isBlank(value)) {
|
85
|
|
86
|
return "";
|
87
|
|
88
|
}
|
89
|
|
90
|
|
91
|
String result = value;
|
92
|
|
93
|
result = DiacriticsRemover.removeDiacritics(result);
|
94
|
|
95
|
result = removeNonLetterDigitCharacters(result);
|
96
|
|
97
|
result = result.toLowerCase();
|
98
|
|
99
|
result = result.trim().replaceAll(" +", " ");
|
100
|
|
101
|
return result;
|
102
|
}
|
103
|
|
104
|
|
105
|
|
106
|
|
107
|
//------------------------ PRIVATE --------------------------
|
108
|
|
109
|
|
110
|
private String removeNonLetterDigitCharacters(final String value) {
|
111
|
|
112
|
StringBuilder sb = new StringBuilder();
|
113
|
|
114
|
for (int i = 0; i < value.length(); ++i) {
|
115
|
|
116
|
char c = value.charAt(i);
|
117
|
|
118
|
if (Character.isLetterOrDigit(c) || whitelistCharacters.contains(c)) {
|
119
|
sb.append(c);
|
120
|
} else {
|
121
|
sb.append(' ');
|
122
|
}
|
123
|
}
|
124
|
|
125
|
return sb.toString();
|
126
|
}
|
127
|
|
128
|
|
129
|
|
130
|
}
|