Project

General

Profile

« Previous | Next » 

Revision 53193

added URL clustering

View differences:

modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import com.google.common.collect.Sets;
4

  
5
import java.net.MalformedURLException;
6
import java.net.URL;
7
import java.util.Collection;
8
import java.util.Map;
9

  
10
public class UrlClustering extends AbstractClusteringFunction {
11

  
12
    public UrlClustering(Map<String, Integer> params) {
13
        super(params);
14
    }
15

  
16
    @Override
17
    protected Collection<String> doApply(String s) {
18
        return Sets.newHashSet(asUrl(s).getHost());
19
    }
20

  
21
    private URL asUrl(final String value) {
22
        try {
23
            return new URL(value);
24
        } catch (MalformedURLException e) {
25
            // should not happen as checked by pace typing
26
            throw new IllegalStateException("invalid URL: " + value);
27
        }
28
    }
29

  
30
}
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
3 3
import java.util.Collection;
4 4
import java.util.List;
5 5
import java.util.Map;
6
import java.util.function.Predicate;
6 7

  
7 8
import com.google.common.collect.Sets;
8 9

  
......
22 23
	/**
	 * Builds the clustering keys for a list of fields.
	 *
	 * For each field, the string value is passed through normalize(...), then
	 * filterStopWords(..., stopwords), then doApply(...), then
	 * filterBlacklisted(..., ngramBlacklist); the resulting keys are accumulated in a
	 * linked hash set, which is returned.
	 *
	 * NOTE(review): this diff hunk shows both the removed for-loop and the stream pipeline
	 * that replaces it; the stream form additionally skips fields for which isEmpty() is true.
	 *
	 * @param fields the fields to derive keys from
	 * @return the accumulated keys
	 */
	@Override
23 24
	public Collection<String> apply(List<Field> fields) {
24 25
		Collection<String> c = Sets.newLinkedHashSet();
25
		for(Field f : fields) {
26
			c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist));
27
		}
26
		fields.stream().filter(f -> !f.isEmpty()).forEach(f -> c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist)));
28 27
		return c;
29 28
	}
30 29

  
modules/dnet-pace-core/trunk/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
1 1
package eu.dnetlib.pace.clustering;
2 2

  
3 3
/**
 * Identifiers of the clustering functions.
 *
 * NOTE(review): presumably each constant is resolved to an implementation class elsewhere;
 * that mapping is not visible in this diff. This revision appends {@code urlclustering}
 * to the previous list.
 */
public enum Clustering {
4
	acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase
4
	acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
5 5
}

Also available in: Unified diff