1 |
39920
|
sandro.lab
|
import logging
|
2 |
|
|
import re
|
3 |
|
|
|
4 |
|
|
import libxml2
|
5 |
|
|
|
6 |
|
|
# Matches an http(s) URL anchored at the start of a string (used with
# ``p.match``).  Written as a raw string so the regex escapes (\w, \d, \+)
# are not parsed as (invalid) Python string escapes; the compiled pattern is
# unchanged.
# NOTE(review): inside the character class, "\+-=" is a *range* from "+" to
# "=", so it also admits ",", "<" and a few others -- presumably unintended,
# but kept as-is so the set of accepted URLs does not change.
p = re.compile(r"((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)")

# One or more whitespace characters; substituted with a single space to
# normalise text read from XML nodes.
pattern = re.compile(r'\s+')

# Logger shared by the classes in this module.
log = logging.getLogger('dli')
|
9 |
|
|
|
10 |
|
|
|
11 |
|
|
class DLIRelation(object):
    """A relation to another entity, read from one ``relation`` XML node.

    Child elements are matched by local name, so namespace prefixes are
    ignored.  Attributes set on the instance:

    * ``relatedDnetId``     -- text of ``dnetIdentifier``
    * ``relatedEntityType`` -- text of ``entitytype``
    * ``typeOfRelation``    -- text of ``typeOfRelation``
    * ``related_title``     -- text of ``title``
    * ``targetPID``         -- text of ``pid``
    * ``targetPIDType``     -- ``type`` attribute of ``pid``
    * ``authors``           -- list with the text of every descendant ``author``
    * ``relation_provenance`` -- list of dicts, one per
      ``relationProvenance/datasource``, with key ``name`` plus whichever of
      ``completionStatus``, ``provisionMode`` and ``collectionDate`` are
      present as attributes.

    Scalar attributes default to ``""`` when the element is missing; when an
    element is repeated, the last occurrence wins.
    """

    # Attributes of a relation-provenance ``datasource`` copied into its dict.
    _PROVENANCE_ATTRIBUTES = ('completionStatus', 'provisionMode', 'collectionDate')

    def __init__(self, node, ctx):
        """Populate the relation from *node*.

        :param node: libxml2 node of the relation element.
        :param ctx: libxml2 XPath context; its context node is changed to
            *node* as a side effect.
        """
        # Defaults first, so every attribute exists even when the matching
        # element is absent (previously reading it raised AttributeError).
        self.relatedDnetId = ""
        self.relatedEntityType = ""
        self.typeOfRelation = ""
        self.related_title = ""
        self.targetPID = ""
        self.targetPIDType = ""
        self.authors = []
        self.relation_provenance = []

        ctx.setContextNode(node)

        for entity in ctx.xpathEval("./*[local-name()='dnetIdentifier']"):
            self.relatedDnetId = pattern.sub(' ', entity.content)

        for entity in ctx.xpathEval("./*[local-name()='entitytype']"):
            self.relatedEntityType = pattern.sub(' ', entity.content)

        for entity in ctx.xpathEval("./*[local-name()='typeOfRelation']"):
            self.typeOfRelation = pattern.sub(' ', entity.content)

        for entity in ctx.xpathEval("./*[local-name()='title']"):
            self.related_title = pattern.sub(' ', entity.content)

        for entity in ctx.xpathEval("./*[local-name()='pid']"):
            self.targetPID = pattern.sub(' ', entity.content)
            for attr in self._attributes(entity):
                if attr.name == 'type':
                    self.targetPIDType = attr.content

        # Authors are searched at any depth below the relation node.
        for entity in ctx.xpathEval(".//*[local-name()='author']"):
            self.authors.append(pattern.sub(' ', entity.content).strip())

        datasources = ctx.xpathEval(
            "./*[local-name()='relationProvenance']/*[local-name()='datasource']")
        for entity in datasources:
            rel_item = {'name': pattern.sub(' ', entity.content).strip()}
            for attr in self._attributes(entity):
                if attr.name in self._PROVENANCE_ATTRIBUTES:
                    rel_item[attr.name] = attr.content
            self.relation_provenance.append(rel_item)

    @staticmethod
    def _attributes(node):
        """Return an iterable over *node*'s attributes, empty if it has none.

        The libxml2 bindings expose ``properties`` as ``None`` for an element
        without attributes, which cannot be iterated directly.
        """
        attrs = node.properties
        if attrs is None:
            return ()
        return attrs
|
55 |
|
|
|
56 |
|
|
|
57 |
|
|
class DLIObject(object):
    """A DLI object parsed from its XML record.

    Elements are matched by local name, so namespace prefixes in the record
    are ignored.  Attributes set on the instance: ``identifier``, ``pid``,
    ``pid_type``, ``resolved_url``, ``completionStatus``,
    ``provenance_record`` (list of dicts), ``objectType`` (``"unknown"`` when
    missing or empty), ``title``, ``date``, ``authors`` (list of strings) and
    ``relations`` (list of ``DLIRelation``).
    """

    def __init__(self, input_xml):
        """Create the object by parsing *input_xml* (see ``initialize_from_xml``)."""
        log.debug("CREATED OBJECT")
        self.initialize_from_xml(input_xml)

    @staticmethod
    def _attributes(node):
        """Return an iterable over *node*'s attributes, empty if it has none.

        The libxml2 bindings expose ``properties`` as ``None`` for an element
        without attributes, which cannot be iterated directly.
        """
        attrs = node.properties
        if attrs is None:
            return ()
        return attrs

    def _reset_fields(self):
        """Set every parsed attribute to its empty default."""
        self.identifier = ""
        self.pid = ""
        self.pid_type = ""
        self.resolved_url = ""
        self.completionStatus = ""
        self.provenance_record = []
        self.objectType = "unknown"
        self.title = ""
        self.date = ""
        self.authors = []
        self.relations = []

    def _associate_identifier(self, ctxt):
        """Read ``dnetResourceIdentifier`` into ``self.identifier`` (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='dnetResourceIdentifier']"):
            self.identifier = node.content

    def initialize_from_xml(self, input_xml):
        """Parse *input_xml* and fill in this object's attributes.

        All attributes are reset to their defaults first, so they exist even
        when the record contains no ``dliObject`` element (an error is logged
        in that case and nothing else is populated).

        :param input_xml: the XML record, as accepted by ``libxml2.parseDoc``.
        :returns: always ``None``.
        """
        log.debug("Parsing input %s", input_xml)
        self._reset_fields()

        doc = libxml2.parseDoc(input_xml)
        ctxt = None
        relation_ctxt = None
        try:
            ctxt = doc.xpathNewContext()
            res = ctxt.xpathEval("//*[local-name()='dliObject']")
            if len(res) == 0:
                log.error("Unable to create DLI object the dli_object node is null")
                return None
            dli_object_node = res[0]

            ctxt.setContextNode(dli_object_node)
            self._associate_identifier(ctxt)
            self._associate_local_PID(ctxt)
            self._associate_complete_status(ctxt)
            self._associate_record_provenance(ctxt)

            # The provenance pass moves the context node; point it back at
            # the dliObject element before reading its other children.
            ctxt.setContextNode(dli_object_node)
            self._associate_type(ctxt)
            self._associate_title(ctxt)
            self._associate_date(ctxt)
            self._associate_authors(ctxt)

            # Same again after the authors pass.  Relations get their own
            # context because DLIRelation re-targets the one it is given.
            ctxt.setContextNode(dli_object_node)
            relation_ctxt = doc.xpathNewContext()
            self._associate_relations(ctxt, relation_ctxt)
        finally:
            # libxml2 objects are not released by Python's garbage collector:
            # free both contexts, then the document they refer to, on every
            # exit path (including the early return and exceptions).
            if relation_ctxt is not None:
                relation_ctxt.xpathFreeContext()
            if ctxt is not None:
                ctxt.xpathFreeContext()
            doc.freeDoc()

    def _associate_local_PID(self, ctxt):
        """Read ``originalIdentifier`` into ``pid``, ``pid_type`` and ``resolved_url``."""
        for node in ctxt.xpathEval("./*[local-name()='originalIdentifier']"):
            self.pid = node.content.strip()
            for attr in self._attributes(node):
                if attr.name == "type":
                    self.pid_type = attr.content
                elif attr.name == "resolvedUrl":
                    self.resolved_url = attr.content
            # "#" is presumably a placeholder for "no resolver URL"; when the
            # PID itself looks like an http(s) URL, use the PID instead.
            if self.resolved_url == "#" and p.match(self.pid):
                self.resolved_url = self.pid

    def _associate_complete_status(self, ctxt):
        """Read ``completionStatus`` into ``self.completionStatus`` (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='completionStatus']"):
            self.completionStatus = node.content.strip()

    def _associate_record_provenance(self, ctxt):
        """Append one dict per provenance datasource to ``self.provenance_record``.

        Each dict has ``name`` plus, when present, ``completionStatus``,
        ``provisionMode`` (datasource attributes) and ``collectionDate``.
        The context node of *ctxt* is moved as a side effect.
        """
        for node in ctxt.xpathEval("./*[local-name()='provenance']"):
            ctxt.setContextNode(node)
            datasources_info = ctxt.xpathEval(".//*[local-name()='datasourceInfo']")
            for datasource_info in datasources_info:
                ctxt.setContextNode(datasource_info)
                for datasource in ctxt.xpathEval("./*[local-name()='datasource']"):
                    item = {'name': pattern.sub(' ', datasource.content.strip())}
                    for attr in self._attributes(datasource):
                        if attr.name == "completionStatus":
                            item['completionStatus'] = attr.content.strip()
                        elif attr.name == "provisionMode":
                            item['provisionMode'] = attr.content.strip()

                    # NOTE(review): collectionDate is looked up below the
                    # whole provenance node, not the current datasourceInfo,
                    # so the last collectionDate found there is used for
                    # every datasource -- confirm this is intended.
                    ctxt.setContextNode(node)
                    for coll_node in ctxt.xpathEval(".//*[local-name()='collectionDate']"):
                        item['collectionDate'] = coll_node.content
                    self.provenance_record.append(item)

    def _associate_type(self, ctxt):
        """Read ``objectType``; an empty value is stored as ``"unknown"``."""
        for node in ctxt.xpathEval("./*[local-name()='objectType']"):
            self.objectType = node.content.strip() or "unknown"

    def _associate_title(self, ctxt):
        """Read ``title`` with surrounding whitespace removed and inner runs collapsed."""
        for node in ctxt.xpathEval("./*[local-name()='title']"):
            self.title = pattern.sub(' ', node.content.strip())

    def _associate_date(self, ctxt):
        """Read ``date`` into ``self.date`` (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='date']"):
            self.date = node.content.strip()

    def _associate_authors(self, ctxt):
        """Append every ``authors/author`` name to ``self.authors``.

        Names are whitespace-normalised and stripped, matching how
        ``DLIRelation`` stores its authors.  The context node of *ctxt* is
        moved as a side effect.
        """
        for node in ctxt.xpathEval("./*[local-name()='authors']"):
            ctxt.setContextNode(node)
            for author in ctxt.xpathEval("./*[local-name()='author']"):
                self.authors.append(pattern.sub(' ', author.content).strip())

    def _associate_relations(self, ctxt, newCtxt):
        """Append a ``DLIRelation`` for every ``relation`` element in the document.

        :param ctxt: context used to find the relation nodes (the XPath is
            absolute, so the current context node does not matter).
        :param newCtxt: separate context handed to each ``DLIRelation``,
            which changes its context node.
        """
        for relation in ctxt.xpathEval("//*[local-name()='relation']"):
            self.relations.append(DLIRelation(relation, newCtxt))
|