Revision 53193
Added by Claudio Atzori over 5 years ago
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import com.google.common.collect.Sets; |
|
4 |
|
|
5 |
import java.net.MalformedURLException; |
|
6 |
import java.net.URL; |
|
7 |
import java.util.Collection; |
|
8 |
import java.util.Map; |
|
9 |
|
|
10 |
public class UrlClustering extends AbstractClusteringFunction { |
|
11 |
|
|
12 |
public UrlClustering(Map<String, Integer> params) { |
|
13 |
super(params); |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
protected Collection<String> doApply(String s) { |
|
18 |
return Sets.newHashSet(asUrl(s).getHost()); |
|
19 |
} |
|
20 |
|
|
21 |
private URL asUrl(final String value) { |
|
22 |
try { |
|
23 |
return new URL(value); |
|
24 |
} catch (MalformedURLException e) { |
|
25 |
// should not happen as checked by pace typing |
|
26 |
throw new IllegalStateException("invalid URL: " + value); |
|
27 |
} |
|
28 |
} |
|
29 |
|
|
30 |
} |
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java | ||
---|---|---|
3 | 3 |
import java.util.Collection; |
4 | 4 |
import java.util.List; |
5 | 5 |
import java.util.Map; |
6 |
import java.util.function.Predicate; |
|
6 | 7 |
|
7 | 8 |
import com.google.common.collect.Sets; |
8 | 9 |
|
... | ... | |
22 | 23 |
@Override |
23 | 24 |
public Collection<String> apply(List<Field> fields) { |
24 | 25 |
Collection<String> c = Sets.newLinkedHashSet(); |
25 |
for(Field f : fields) { |
|
26 |
c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist)); |
|
27 |
} |
|
26 |
fields.stream().filter(f -> !f.isEmpty()).forEach(f -> c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist))); |
|
28 | 27 |
return c; |
29 | 28 |
} |
30 | 29 |
|
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/Clustering.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.clustering; |
2 | 2 |
|
3 | 3 |
public enum Clustering { |
4 |
acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase |
|
4 |
acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
|
|
5 | 5 |
} |
Also available in: Unified diff
added URL clustering