Revision 53194
Added by Claudio Atzori over 5 years ago
UrlClustering.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.pace.clustering; |
2 | 2 |
|
3 |
import com.google.common.collect.Sets; |
|
3 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
4 |
import eu.dnetlib.pace.model.Field; |
|
4 | 5 |
|
5 | 6 |
import java.net.MalformedURLException; |
6 | 7 |
import java.net.URL; |
7 | 8 |
import java.util.Collection; |
9 |
import java.util.HashSet; |
|
10 |
import java.util.List; |
|
8 | 11 |
import java.util.Map; |
12 |
import java.util.stream.Collectors; |
|
9 | 13 |
|
10 |
public class UrlClustering extends AbstractClusteringFunction { |
|
14 |
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
|
11 | 15 |
|
12 |
public UrlClustering(Map<String, Integer> params) { |
|
13 |
super(params); |
|
16 |
protected Map<String, Integer> params; |
|
17 |
|
|
18 |
public UrlClustering(final Map<String, Integer> params) { |
|
19 |
this.params = params; |
|
14 | 20 |
} |
15 | 21 |
|
16 | 22 |
@Override |
17 |
protected Collection<String> doApply(String s) { |
|
18 |
return Sets.newHashSet(asUrl(s).getHost()); |
|
23 |
public Collection<String> apply(List<Field> fields) { |
|
24 |
return fields.stream() |
|
25 |
.filter(f -> !f.isEmpty()) |
|
26 |
.map(Field::stringValue) |
|
27 |
.map(this::asUrl) |
|
28 |
.map(URL::getHost) |
|
29 |
.collect(Collectors.toCollection(HashSet::new)); |
|
19 | 30 |
} |
20 | 31 |
|
32 |
@Override |
|
33 |
public Map<String, Integer> getParams() { |
|
34 |
return null; |
|
35 |
} |
|
36 |
|
|
21 | 37 |
private URL asUrl(final String value) { |
22 | 38 |
try { |
23 | 39 |
return new URL(value); |
Also available in: Unified diff
added URL clustering