Project

General

Profile

/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */

    
19
package eu.dnetlib.dhp.common.string;
20

    
21
import java.text.Normalizer;
22
import java.util.HashMap;
23
import java.util.Map;
24

    
25
/**
 * Mapping to the basic Latin alphabet (a-z, A-Z). In most cases, a character is
 * mapped to the closest visual form, rather than functional one, e.g.: "ö" is
 * mapped to "o" rather than "oe", and "đ" is mapped to "d" rather than "dj" or
 * "gj". Notable exceptions include: "ĸ" mapped to "q", "ß" mapped to "ss", and
 * "Þ", "þ" mapped to "Y", "y".
 *
 * <p> The text is processed as follows: <ol> <li>it is compatibility
 * decomposed (NFKD),</li> <li>every code point whose general category is
 * "non-spacing mark" is removed,</li> <li>additional "manual" substitutions
 * are applied to the remaining characters.</li> </ol> The result is
 * <em>not</em> recomposed, so it stays in decomposed form. </p>
 *
 * <p> The manual substitution table covers letters of the "Latin-1 Supplement"
 * and "Latin Extended-A" Unicode blocks that are not reduced to basic Latin
 * letters by the decomposition itself. Characters from other alphabets are
 * generally left intact, although the decomposable ones may be affected by the
 * procedure. </p>
 *
 * @author Lukasz Bolikowski (bolo@icm.edu.pl)
 *
 * @author Łukasz Dumiszewski /just copied from coansys-commons/
 *
 */
public final class DiacriticsRemover {

    /** Characters replaced by hand; {@code FROM[i]} is replaced with {@code TO[i]}. */
    private static final char[] FROM = {
        'Æ', 'Ð', 'Ø', 'Þ', 'ß', 'æ', 'ð', 'ø', 'þ', 'Đ', 'đ', 'Ħ',
        'ħ', 'ı', 'ĸ', 'Ł', 'ł', 'Ŋ', 'ŋ', 'Œ', 'œ', 'Ŧ', 'ŧ'};

    /** Replacements, index-aligned with {@link #FROM}. */
    private static final String[] TO = {
        "AE", "D", "O", "Y", "ss", "ae", "d", "o", "y", "D", "d", "H",
        "h", "i", "q", "L", "l", "N", "n", "OE", "oe", "T", "t"};

    /** Substitution table built once from {@link #FROM} and {@link #TO}; must be declared after them. */
    private static final Map<Character, String> LOOKUP = buildLookup();


    //------------------------ CONSTRUCTORS -------------------


    private DiacriticsRemover() {}


    //------------------------ LOGIC --------------------------


    /**
     * Removes diacritics from a text.
     *
     * <p> The text is traversed by Unicode code point rather than by UTF-16
     * code unit, so non-spacing marks outside the Basic Multilingual Plane are
     * removed as well, and other supplementary characters are copied intact.
     * </p>
     *
     * @param text Text to process, may be {@code null}.
     * @return Text without diacritics, or {@code null} if {@code text} is {@code null}.
     */
    public static String removeDiacritics(String text) {
        if (text == null) {
            return null;
        }

        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);

        StringBuilder builder = new StringBuilder(decomposed.length());
        int i = 0;
        while (i < decomposed.length()) {
            int codePoint = decomposed.codePointAt(i);
            i += Character.charCount(codePoint);

            if (Character.getType(codePoint) == Character.NON_SPACING_MARK) {
                continue;
            }

            // The substitution table only holds BMP characters, so a
            // supplementary code point can never have a replacement.
            String replacement = Character.isSupplementaryCodePoint(codePoint)
                    ? null
                    : LOOKUP.get(Character.valueOf((char) codePoint));

            if (replacement != null) {
                builder.append(replacement);
            } else {
                builder.appendCodePoint(codePoint);
            }
        }

        return builder.toString();
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Builds the substitution table from the index-aligned arrays.
     *
     * @return map from a character to its replacement string
     * @throws IllegalStateException if the two arrays differ in length
     */
    private static Map<Character, String> buildLookup() {
        if (FROM.length != TO.length) {
            throw new IllegalStateException(
                    "Substitution arrays differ in length: " + FROM.length + " vs " + TO.length);
        }

        Map<Character, String> table = new HashMap<Character, String>();
        for (int i = 0; i < FROM.length; i++) {
            table.put(FROM[i], TO[i]);
        }

        return table;
    }
}
(2-2/4)