Revision 61714
Added by Michele Artini over 2 years ago
modules/dnet-isti/trunk/src/main/java/eu/dnetlib/data/mdstore/plugins/FilterMdRecordsPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mdstore.plugins; |
|
2 |
|
|
3 |
import java.io.StringReader; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
import org.dom4j.Document; |
|
9 |
import org.dom4j.DocumentException; |
|
10 |
import org.dom4j.io.SAXReader; |
|
11 |
|
|
12 |
import com.mongodb.BasicDBObject; |
|
13 |
import com.mongodb.DBObject; |
|
14 |
import com.mongodb.client.MongoCollection; |
|
15 |
|
|
16 |
import eu.dnetlib.data.mdstore.modular.mongodb.MongoMDStore; |
|
17 |
import eu.dnetlib.rmi.data.MDStoreServiceException; |
|
18 |
|
|
19 |
public class FilterMdRecordsPlugin extends AbstractIstiMDStorePlugin { |
|
20 |
|
|
21 |
private static final Log log = LogFactory.getLog(FilterMdRecordsPlugin.class); |
|
22 |
|
|
23 |
@Override |
|
24 |
public final void process(final MongoMDStore store, final Map<String, String> params) throws MDStoreServiceException { |
|
25 |
|
|
26 |
final String xpath = params.get("xpath"); |
|
27 |
|
|
28 |
final MongoCollection<DBObject> collPubs = store.getCollection(); |
|
29 |
|
|
30 |
long valid = 0; |
|
31 |
long skipped = 0; |
|
32 |
|
|
33 |
final SAXReader reader = new SAXReader(); |
|
34 |
for (final DBObject obj : collPubs.find()) { |
|
35 |
try { |
|
36 |
final String recordId = obj.get("id").toString(); |
|
37 |
final Document doc = reader.read(new StringReader(obj.get("body").toString())); |
|
38 |
if (doc.selectNodes(xpath).isEmpty()) { |
|
39 |
collPubs.deleteOne(new BasicDBObject("id", recordId)); |
|
40 |
skipped++; |
|
41 |
} else { |
|
42 |
valid++; |
|
43 |
} |
|
44 |
} catch (final DocumentException e) { |
|
45 |
log.warn("Problem parsing a mdstore record", e); |
|
46 |
} |
|
47 |
} |
|
48 |
|
|
49 |
log.info("***** Filtering records *****"); |
|
50 |
log.info("* xpath: " + xpath); |
|
51 |
log.info("* valid: " + valid); |
|
52 |
log.info("* skipped: " + skipped); |
|
53 |
log.info("*****************************"); |
|
54 |
|
|
55 |
touch(store); |
|
56 |
} |
|
57 |
|
|
58 |
} |
modules/dnet-isti/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/common/filterRecords.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="bc0e925e-2beb-44c4-9a91-e81b45392361_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ==" /> |
|
4 |
<RESOURCE_TYPE value="WorkflowTemplateDSResourceType" /> |
|
5 |
<RESOURCE_KIND value="WorkflowTemplateDSResources" /> |
|
6 |
<RESOURCE_URI value="" /> |
|
7 |
<DATE_OF_CREATION value="2021-10-25T11:50:45+02:00" /> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<PARAMETERS> |
|
12 |
<PARAM description="Datasource Id" name="dsId" required="true" type="string"/> |
|
13 |
<PARAM description="Datasource Interface" name="interface" required="true" type="string"/> |
|
14 |
<PARAM description="MDStoreId" name="mdId" required="true" type="string" /> |
|
15 |
<PARAM description="xpath" name="xpath" required="true" type="string" /> |
|
16 |
</PARAMETERS> |
|
17 |
<WORKFLOW> |
|
18 |
<NODE isStart="true" name="filter_records" type="RunMDStorePlugin"> |
|
19 |
<DESCRIPTION>Filter records by XPATH</DESCRIPTION> |
|
20 |
<PARAMETERS> |
|
21 |
<PARAM name="pluginName" value="filterMDstorePlugin" /> |
|
22 |
<PARAM name="parameters"> |
|
23 |
<MAP> |
|
24 |
<ENTRY key="mdId" ref="mdId" /> |
|
25 |
<ENTRY key="xpath" ref="xpath" /> |
|
26 |
</MAP> |
|
27 |
</PARAM> |
|
28 |
</PARAMETERS> |
|
29 |
<ARCS> |
|
30 |
<ARC to="UPDATE_INFO" /> |
|
31 |
</ARCS> |
|
32 |
</NODE> |
|
33 |
|
|
34 |
<NODE name="UPDATE_INFO" type="MDStoreToApiExtraField"> |
|
35 |
<DESCRIPTION>Update datasource API extra fields</DESCRIPTION> |
|
36 |
<PARAMETERS> |
|
37 |
<PARAM name="mdId" ref="mdId"/> |
|
38 |
<PARAM name="datasourceId" ref="dsId"/> |
|
39 |
<PARAM name="datasourceInterface" ref="interface"/> |
|
40 |
<PARAM name="extraFieldForTotal" value="last_aggregation_total"/> |
|
41 |
<PARAM name="extraFieldForDate" value="last_aggregation_date"/> |
|
42 |
<PARAM name="extraFieldForMdId" value="last_aggregation_mdId"/> |
|
43 |
</PARAMETERS> |
|
44 |
<ARCS> |
|
45 |
<ARC to="success"/> |
|
46 |
</ARCS> |
|
47 |
</NODE> |
|
48 |
</WORKFLOW> |
|
49 |
</CONFIGURATION> |
|
50 |
</BODY> |
|
51 |
</RESOURCE_PROFILE> |
modules/dnet-isti/trunk/src/main/resources/eu/dnetlib/isti/applicationContext-isti.xml | ||
---|---|---|
24 | 24 |
|
25 | 25 |
<bean id="enrichLocalLinksPlugin" class="eu.dnetlib.data.mdstore.plugins.EnrichLocalLinksPlugin" /> |
26 | 26 |
|
27 |
<bean id="filterMDstorePlugin" class="eu.dnetlib.data.mdstore.plugins.FilterMdRecordsPlugin" /> |
|
28 |
|
|
27 | 29 |
<!-- Affiliations DAO --> |
28 | 30 |
<bean id="affiliationsDao" class="eu.dnetlib.data.db.AffiliationsDao"> |
29 | 31 |
<property name="jdbcTemplate"> |
Also available in: Unified diff
filter plugin