1
|
import logging
|
2
|
import re
|
3
|
|
4
|
import libxml2
|
5
|
|
6
|
# Recognises an http/https URL at the start of a string; DLIObject uses it to
# decide whether a PID can stand in for its own resolved URL.
# Written as a raw string so the regex escapes (\w, \d, \+, \.) are no longer
# also invalid Python string escapes; the compiled pattern is unchanged
# character for character.
# NOTE(review): inside the character class, `\+-=` is a range from '+' to '='
# rather than three literal characters -- presumably unintended, but kept so
# matching behaviour does not change.
p = re.compile(r"((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)")

# One or more whitespace characters; replaced by a single space to normalise
# text read from XML nodes.
pattern = re.compile(r'\s+')

# Logger used by the classes in this module.
log = logging.getLogger('dli')
|
9
|
|
10
|
|
11
|
class DLIRelation(object):
    """A relation read from one ``relation`` XML node.

    Attributes:
        relatedDnetId: text of the ``dnetIdentifier`` child ("" if absent).
        relatedEntityType: text of the ``entitytype`` child ("" if absent).
        typeOfRelation: text of the ``typeOfRelation`` child ("" if absent).
        related_title: text of the ``title`` child ("" if absent).
        targetPID: text of the ``pid`` child ("" if absent).
        targetPIDType: value of the ``type`` attribute of ``pid`` ("" if absent).
        authors: stripped text of every descendant ``author`` element.
        relation_provenance: one dict per ``relationProvenance/datasource``
            element, with key ``name`` plus whichever of ``completionStatus``,
            ``provisionMode`` and ``collectionDate`` are present as attributes.

    All text values have runs of whitespace collapsed to a single space.
    When an element occurs more than once, the last occurrence wins.
    """

    def __init__(self, node, ctx):
        """Populate the relation from ``node``.

        Args:
            node: the relation element (libxml2 node).
            ctx: an XPath context for the node's document. Its context node
                is set to ``node`` and left there.
        """
        ctx.setContextNode(node)

        # Single-valued fields always exist, even when the XML omits them,
        # so readers never hit AttributeError on a sparse relation.
        self.relatedDnetId = self._last_child_text(ctx, 'dnetIdentifier')
        self.relatedEntityType = self._last_child_text(ctx, 'entitytype')
        self.typeOfRelation = self._last_child_text(ctx, 'typeOfRelation')
        self.related_title = self._last_child_text(ctx, 'title')

        self.targetPID = ""
        self.targetPIDType = ""
        for pid_node in ctx.xpathEval("./*[local-name()='pid']"):
            self.targetPID = pattern.sub(' ', pid_node.content)
            for attr in self._attributes(pid_node):
                if attr.name == 'type':
                    self.targetPIDType = attr.content

        # Authors are searched at any depth below the relation node.
        self.authors = [
            pattern.sub(' ', author.content).strip()
            for author in ctx.xpathEval(".//*[local-name()='author']")
        ]

        self.relation_provenance = []
        datasources = ctx.xpathEval(
            "./*[local-name()='relationProvenance']/*[local-name()='datasource']")
        for datasource in datasources:
            rel_item = {'name': pattern.sub(' ', datasource.content).strip()}
            for attr in self._attributes(datasource):
                if attr.name in ('completionStatus', 'provisionMode', 'collectionDate'):
                    rel_item[attr.name] = attr.content
            self.relation_provenance.append(rel_item)

    @staticmethod
    def _last_child_text(ctx, local_name):
        """Return the whitespace-normalised text of the last direct child
        named ``local_name`` (namespace ignored), or "" when there is none."""
        found = ctx.xpathEval("./*[local-name()='%s']" % local_name)
        if not found:
            return ""
        return pattern.sub(' ', found[-1].content)

    @staticmethod
    def _attributes(element):
        """Return an iterable over ``element``'s attributes.

        The libxml2 bindings expose ``properties`` as None (not an empty
        sequence) when an element has no attributes, so map that to [].
        """
        props = element.properties
        return [] if props is None else props
|
55
|
|
56
|
|
57
|
class DLIObject(object):
    """An object parsed from an XML document containing a ``dliObject`` element.

    Attributes (defaults in parentheses):
        identifier (""): text of ``dnetResourceIdentifier``.
        pid (""), pid_type (""), resolved_url (""): text and the ``type`` /
            ``resolvedUrl`` attributes of ``originalIdentifier``.
        completionStatus (""): text of ``completionStatus``.
        provenance_record ([]): one dict per provenance datasource.
        objectType ("unknown"): text of ``objectType``.
        title (""), date (""): text of ``title`` / ``date``.
        authors ([]): text of each ``authors/author`` element.
        relations ([]): one DLIRelation per ``relation`` element.
    """

    def __init__(self, input_xml):
        """Build the object from the XML string ``input_xml``."""
        log.debug("CREATED OBJECT")
        # Give every attribute its default up front, so an object built from
        # XML without a dliObject element is still safe to read.
        self._reset_fields()
        self.initialize_from_xml(input_xml)

    def _reset_fields(self):
        """Set every parsed attribute to its empty default."""
        self.identifier = ""
        self.pid = ""
        self.pid_type = ""
        self.resolved_url = ""
        self.completionStatus = ""
        self.provenance_record = []
        self.objectType = "unknown"
        self.title = ""
        self.date = ""
        self.authors = []
        self.relations = []

    @staticmethod
    def _attributes(element):
        """Return an iterable over ``element``'s attributes.

        The libxml2 bindings expose ``properties`` as None (not an empty
        sequence) when an element has no attributes, so map that to [].
        """
        props = element.properties
        return [] if props is None else props

    def _associate_identifier(self, ctxt):
        """Set ``identifier`` from the ``dnetResourceIdentifier`` child (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='dnetResourceIdentifier']"):
            self.identifier = node.content

    def initialize_from_xml(self, input_xml):
        """Parse ``input_xml`` and (re)populate every attribute from it.

        If the document has no ``dliObject`` element an error is logged and
        the current attribute values are left untouched.  Exceptions raised
        by ``libxml2.parseDoc`` for unparsable input propagate to the caller.

        Returns:
            None in every case.
        """
        log.debug("Parsing input %s", input_xml)
        doc = libxml2.parseDoc(input_xml)
        ctxt = None
        relation_ctxt = None
        try:
            ctxt = doc.xpathNewContext()
            res = ctxt.xpathEval("//*[local-name()='dliObject']")
            if not res:
                log.error("Unable to create DLI object the dli_object node is null")
                return None
            dli_object_node = res[0]
            ctxt.setContextNode(dli_object_node)
            self._reset_fields()

            self._associate_identifier(ctxt)
            self._associate_local_PID(ctxt)
            self._associate_complete_status(ctxt)
            self._associate_record_provenance(ctxt)
            # Some helpers move the context node; put it back on the
            # dliObject element before the next group of lookups.
            ctxt.setContextNode(dli_object_node)
            self._associate_type(ctxt)
            self._associate_title(ctxt)
            self._associate_date(ctxt)
            self._associate_authors(ctxt)
            ctxt.setContextNode(dli_object_node)
            # Relations get their own context because DLIRelation re-points
            # the context node at each relation element.
            relation_ctxt = doc.xpathNewContext()
            self._associate_relations(ctxt, relation_ctxt)
        finally:
            # The libxml2 bindings do not free these automatically; release
            # them on every exit path, including the early return above.
            if relation_ctxt is not None:
                relation_ctxt.xpathFreeContext()
            if ctxt is not None:
                ctxt.xpathFreeContext()
            doc.freeDoc()
        return None

    def _associate_local_PID(self, ctxt):
        """Set ``pid``, ``pid_type`` and ``resolved_url`` from ``originalIdentifier``.

        When the resolved URL is the placeholder "#" and the PID itself
        matches the module's URL regex ``p``, the PID is used as the URL.
        """
        for node in ctxt.xpathEval("./*[local-name()='originalIdentifier']"):
            self.pid = node.content.strip()
            for attr in self._attributes(node):
                if attr.name == "type":
                    self.pid_type = attr.content
                elif attr.name == "resolvedUrl":
                    self.resolved_url = attr.content
            if self.resolved_url == "#" and p.match(self.pid):
                self.resolved_url = self.pid

    def _associate_complete_status(self, ctxt):
        """Set ``completionStatus`` from the child of the same name (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='completionStatus']"):
            self.completionStatus = node.content.strip()

    def _associate_record_provenance(self, ctxt):
        """Append one dict per provenance datasource to ``provenance_record``.

        Each dict has ``name`` (whitespace-normalised text), the stripped
        ``completionStatus`` / ``provisionMode`` attributes when present, and
        ``collectionDate`` taken from the last ``collectionDate`` element
        found anywhere under the enclosing ``provenance`` element.

        The context node is left on the last ``provenance`` element visited.

        NOTE(review): the original nesting of this method was lost; this
        assumes one record per ``datasource`` element -- confirm against the
        data.  Building the record inside the datasource loop also means a
        ``datasourceInfo`` without datasources can no longer hit an unbound
        (or stale) ``item``.
        """
        for node in ctxt.xpathEval("./*[local-name()='provenance']"):
            ctxt.setContextNode(node)
            # Looked up relative to the provenance node, so it is the same
            # for every datasource below it: evaluate it once.
            collection_dates = ctxt.xpathEval(".//*[local-name()='collectionDate']")
            for datasource_info in ctxt.xpathEval(".//*[local-name()='datasourceInfo']"):
                ctxt.setContextNode(datasource_info)
                for datasource in ctxt.xpathEval("./*[local-name()='datasource']"):
                    item = {'name': pattern.sub(' ', datasource.content.strip())}
                    for attr in self._attributes(datasource):
                        if attr.name in ("completionStatus", "provisionMode"):
                            item[attr.name] = attr.content.strip()
                    if collection_dates:
                        item['collectionDate'] = collection_dates[-1].content
                    self.provenance_record.append(item)
            ctxt.setContextNode(node)

    def _associate_type(self, ctxt):
        """Set ``objectType`` from the ``objectType`` child; blank becomes "unknown"."""
        for node in ctxt.xpathEval("./*[local-name()='objectType']"):
            self.objectType = node.content.strip()
        if self.objectType == "":
            self.objectType = "unknown"

    def _associate_title(self, ctxt):
        """Set ``title`` from the ``title`` child, stripped and whitespace-normalised."""
        for node in ctxt.xpathEval("./*[local-name()='title']"):
            self.title = pattern.sub(' ', node.content.strip())

    def _associate_date(self, ctxt):
        """Set ``date`` from the ``date`` child (last one wins)."""
        for node in ctxt.xpathEval("./*[local-name()='date']"):
            self.date = node.content.strip()

    def _associate_authors(self, ctxt):
        """Append the whitespace-normalised text of each ``authors/author`` element.

        Moves the context node to each ``authors`` element it visits.
        """
        for node in ctxt.xpathEval("./*[local-name()='authors']"):
            ctxt.setContextNode(node)
            for author in ctxt.xpathEval("./*[local-name()='author']"):
                self.authors.append(pattern.sub(' ', author.content))

    def _associate_relations(self, ctxt, newCtxt):
        """Append a DLIRelation for every ``relation`` element in the document.

        Args:
            ctxt: context used to find the relation elements (the XPath is
                absolute, so the current context node does not matter).
            newCtxt: separate context handed to each DLIRelation, which
                re-points its context node.
        """
        for relation in ctxt.xpathEval("//*[local-name()='relation']"):
            self.relations.append(DLIRelation(relation, newCtxt))
|