Project

General

Profile

1
package eu.dnetlib.data.mdstore.modular.mongodb.utils;
2

    
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.List;
6
import java.util.Map;
7

    
8
import com.ximpleware.AutoPilot;
9
import com.ximpleware.VTDGen;
10
import com.ximpleware.VTDNav;
11
import eu.dnetlib.data.mdstore.modular.MDFormatDescription;
12
import org.apache.commons.logging.Log;
13
import org.apache.commons.logging.LogFactory;
14

    
15
/**
16
 * Created by sandro on 11/29/16.
17
 */
18
public class IndexFieldRecordParser {
19

    
20
    private static final Log log = LogFactory.getLog(IndexFieldRecordParser.class);
21

    
22
    private static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws Exception {
23
        List<String> results = new ArrayList<>();
24
        ap.selectXPath(xpath);
25
        while (ap.evalXPath() != -1) {
26
            int t = vn.getText();
27
            if (t > -1) results.add(vn.toNormalizedString(t));
28
        }
29
        return results;
30
    }
31

    
32
    public Map<String, List<String>> parseRecord(final String record, final List<MDFormatDescription> mdformats) throws IndexFieldRecordParserException {
33
        if (mdformats == null || mdformats.size() == 0)
34
            return null;
35
        final Map<String, List<String>> result = new HashMap<>();
36

    
37
        try {
38
            final VTDGen vg = new VTDGen();
39
            vg.setDoc(record.getBytes());
40
            vg.parse(true);
41
            final VTDNav vn = vg.getNav();
42
            final AutoPilot ap = new AutoPilot(vn);
43

    
44
            for (MDFormatDescription description : mdformats) {
45
                List<String> xpathResult = getTextValue(ap, vn, description.getXpath());
46
                result.put(description.getName(), xpathResult);
47
            }
48
            return result;
49
        } catch (Throwable e) {
50
            throw new IndexFieldRecordParserException("Cannot index record", e);
51
        }
52
    }
53

    
54

    
55
}
(2-2/5)