Project

General

Profile

/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 * 
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
18
package eu.dnetlib.dhp.common.string;
19

    
20
import java.io.Serializable;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.ImmutableList;
26

    
27
/**
28
 * An implementation of {@link StringNormalizer} that normalizes strings for non-strict comparisons
29
 * in which one does not care about characters other than letters and digits or about differently written diacritics.
30
 *
31
 * @author Łukasz Dumiszewski
32
 *
33
 */
34
public final class LenientComparisonStringNormalizer implements StringNormalizer, Serializable {
35

    
36

    
37
    private static final long serialVersionUID = 1L;
38
    
39
    
40
    private List<Character> whitelistCharacters;
41
    
42
    
43
    //------------------------ CONSTRUCTORS --------------------------
44
    
45
    public LenientComparisonStringNormalizer() {
46
        this(ImmutableList.of());
47
    }
48
    
49
    /**
50
     * @param whitelistCharacters - non alphanumeric characters that will not be removed
51
     *      during normalization
52
     */
53
    public LenientComparisonStringNormalizer(List<Character> whitelistCharacters) {
54
        this.whitelistCharacters = whitelistCharacters;
55
    }
56
    
57
    
58
    //------------------------ LOGIC --------------------------
59

    
60
    
61
    
62
    /**
63
     * Normalizes the given value. <br/>
64
     * The normalized strings are better suited for non-strict comparisons, in which one does NOT care about characters that are
65
     * neither letters nor digits; about accidental spaces or different diacritics etc. <br/><br/>
66
     * This method:
67
     * <ul>
68
     * <li>Replaces all characters that are not letters or digits with spaces (except those on whitelist characters list)</li>
69
     * <li>Replaces white spaces with spaces </li>
70
     * <li>Trims</li>
71
     * <li>Compacts multi-space gaps to one-space gaps</li>
72
     * <li>Removes diacritics</li>
73
     * <li>Changes characters to lower case</li>
74
     * </ul>
75
     * Returns "" if the passed value is null or blank
76
     *
77
     * @param value the string to normalize 
78
     * @see DiacriticsRemover#removeDiacritics(String, boolean)
79
     *
80
     *
81
     */
82
    public String normalize(String value) {
83
        
84
        if (StringUtils.isBlank(value)) {
85
        
86
            return "";
87

    
88
        }
89
        
90
        
91
        String result = value;
92
        
93
        result = DiacriticsRemover.removeDiacritics(result);
94
        
95
        result = removeNonLetterDigitCharacters(result);
96
        
97
        result = result.toLowerCase();
98
        
99
        result = result.trim().replaceAll(" +", " ");
100
        
101
        return result;
102
    }
103
    
104
    
105
    
106
    
107
    //------------------------ PRIVATE --------------------------
108

    
109
    
110
    private String removeNonLetterDigitCharacters(final String value) {
111
        
112
        StringBuilder sb = new StringBuilder();
113
        
114
        for (int i = 0; i < value.length(); ++i) {
115
   
116
            char c = value.charAt(i);
117
            
118
            if (Character.isLetterOrDigit(c) || whitelistCharacters.contains(c)) {
119
                sb.append(c);
120
            } else {
121
                sb.append(' ');
122
            }
123
        }
124
        
125
        return sb.toString();
126
    }
127

    
128
 
129

    
130
}
(3-3/4)