Revision 50064
Added by Sandro La Bruzzo over 6 years ago
es_connector.py | ||
---|---|---|
1 | 1 |
from json import JSONEncoder |
2 | 2 |
|
3 | 3 |
import sys |
4 |
|
|
5 |
import re |
|
4 | 6 |
from elasticsearch import Elasticsearch |
5 | 7 |
from elasticsearch_dsl import * |
6 | 8 |
|
7 | 9 |
import os |
8 | 10 |
from os import path |
9 | 11 |
|
10 |
|
|
11 | 12 |
pid_resolver = { |
12 | 13 |
"pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s", |
13 | 14 |
"ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s", |
... | ... | |
35 | 36 |
|
36 | 37 |
|
37 | 38 |
# DOI syntax check (e.g. "10.1234/some.suffix"), anchored at the start of the
# identifier. Compiled once at import time rather than on every call.
_DOI_PATTERN = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b")


def resolveIdentifier(pid, pid_type):
    """Return a URL at which the identifier ``pid`` can be resolved.

    :param pid: the identifier value (a string); ``None`` yields ``""``.
    :param pid_type: the declared identifier scheme (case-insensitive);
        ``None`` yields ``""``.
    :return: the resolver URL, or ``""`` when either argument is ``None``.

    Resolution order:
      1. If ``pid`` starts with something that looks like a DOI, the declared
         type is overridden with ``'doi'``.
      2. If the (lower-cased) type is a key of ``pid_resolver``, its ``%s``
         URL template is filled with ``pid``.
      3. ``'openaire'`` identifiers get the OpenAIRE search URL, with the
         ``oai:dnet:`` prefix removed from ``pid``.
      4. Anything else falls back to identifiers.org.
    """
    # Without a type there is nothing to resolve against, and a missing pid
    # would otherwise blow up inside the regex match below.
    if pid is None or pid_type is None:
        return ""

    # Records are sometimes tagged with a wrong scheme; trust the DOI syntax
    # over the declared type.
    if _DOI_PATTERN.match(pid):
        pid_type = 'doi'

    scheme = pid_type.lower()
    if scheme in pid_resolver:
        return pid_resolver[scheme] % pid
    if scheme == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    # The fallback keeps the caller's original casing of pid_type.
    return "http://identifiers.org/%s:%s" % (pid_type, pid)
47 | 57 |
|
58 |
|
|
48 | 59 |
def get_property(): |
49 | 60 |
f = open(path.join(os.path.dirname(os.path.realpath(__file__)), '../../api.properties')) |
50 | 61 |
p = {} |
... | ... | |
63 | 74 |
return Q('nested', path='localIdentifier', query=Q('bool', must=[Q('match', **args)])) |
64 | 75 |
|
65 | 76 |
|
77 |
def create_pid_query(value):
    """Build a nested query on ``localIdentifier`` matching ``value`` against
    the ``localIdentifier.id`` field.

    :param value: the identifier value to match.
    :return: a ``nested`` query wrapping a ``bool``/``must`` with one ``match``.
    """
    # The dotted field name is not a valid keyword, hence the dict expansion.
    id_match = Q('match', **{'localIdentifier.id': value})
    must_clause = Q('bool', must=[id_match])
    return Q('nested', path='localIdentifier', query=must_clause)
|
80 |
|
|
81 |
|
|
66 | 82 |
def create_publisher_filter(value):
    """Build a ``match`` query on the ``publisher`` field.

    :param value: the publisher text to match.
    :return: the ``match`` query object.
    """
    publisher_match = Q('match', publisher=value)
    return publisher_match
68 | 84 |
|
... | ... | |
93 | 109 |
self.client = Elasticsearch(hosts=self.index_host) |
94 | 110 |
self.index_name = props['api.index'] |
95 | 111 |
|
112 |
def query_by_id(self, id):
    """Search ``object`` documents by identifier and return hits plus facets.

    The query is the one built by ``create_pid_query(id)``. Four term
    aggregations are requested alongside it: typology, datasource name
    (nested under ``datasources``), publisher, and identifier type (nested
    under ``localIdentifier``).

    Each hit is modified in place before being returned: every entry of
    ``localIdentifier`` gains a ``url`` key from ``resolveIdentifier``, and
    titles wrapped in double quotes have the quotes stripped.

    :param id: the identifier value to look up.
    :return: a ``DLIESResponse`` with ``total``, ``facet`` and ``hits``.
    """
    search = Search(using=self.client, index=self.index_name).doc_type('object')
    search = search.query(create_pid_query(id))

    # Facets requested together with the hits.
    search.aggs.bucket('typologies', 'terms', field='typology')
    datasource_agg = search.aggs.bucket('all_datasources', 'nested', path='datasources')
    datasource_agg.bucket('all_names', 'terms', field='datasources.datasourceName')
    search.aggs.bucket('all_publisher', 'terms', field='publisher')
    pid_agg = search.aggs.bucket('all_pids', 'nested', path='localIdentifier')
    pid_agg.bucket('all_types', 'terms', field='localIdentifier.type')

    response = search.execute()

    hits = []
    for hit in response.hits:
        # NOTE(review): '_d_' is a private attribute of the hit object,
        # presumably its backing dict -- confirm against the library version.
        record = hit.__dict__['_d_']

        for identifier in record.get('localIdentifier', []):
            identifier['url'] = resolveIdentifier(identifier['id'], identifier['type'])

        # Drop one pair of enclosing double quotes from each title, if present.
        record['title'] = [
            title[1:-1] if len(title) > 0 and title[0] == '"' and title[-1] == '"' else title
            for title in record.get('title', [])
        ]
        hits.append(record)

    pid_types = [dict(key=bucket.key, count=bucket.doc_count)
                 for bucket in response.aggs.all_pids.all_types.buckets]
    datasources = [dict(key=bucket.key, count=bucket.doc_count)
                   for bucket in response.aggs.all_datasources.all_names.buckets]
    typologies = [dict(key=bucket.key, count=bucket.doc_count)
                  for bucket in response.aggs.typologies.buckets]
    # Buckets with an empty publisher name are left out of the facet.
    publishers = [dict(key=bucket.key, count=bucket.doc_count)
                  for bucket in response.aggs.all_publisher.buckets
                  if len(bucket.key) > 0]

    facet = dict(pid=pid_types, typology=typologies, datasource=datasources,
                 publishers=publishers)
    return DLIESResponse(total=response.hits.total, facet=facet, hits=hits)
|
160 |
|
|
96 | 161 |
def simple_query(self, textual_query, start=None, end=None, user_filter=None): |
97 | 162 |
s = Search(using=self.client, index=self.index_name).doc_type('object') |
98 | 163 |
q = Q('match', _all=textual_query) |
... | ... | |
135 | 200 |
input_source = index_result.__dict__['_d_'] |
136 | 201 |
fixed_titles = [] |
137 | 202 |
|
138 |
for ids in input_source.get('localIdentifier',[]):
|
|
203 |
for ids in input_source.get('localIdentifier', []):
|
|
139 | 204 |
ds = resolveIdentifier(ids['id'], ids['type']) |
140 | 205 |
ids['url'] = ds |
141 |
for t in input_source.get('title',[]): |
|
206 |
for t in input_source.get('title', []):
|
|
142 | 207 |
if len(t) > 0 and t[0] == '"' and t[-1] == '"': |
143 | 208 |
fixed_titles.append(t[1:-1]) |
144 | 209 |
else: |
... | ... | |
180 | 245 |
hits = [] |
181 | 246 |
|
182 | 247 |
for index_hit in response.hits: |
183 |
hits.append(index_hit.__dict__['_d_']) |
|
248 |
current_item = index_hit.__dict__['_d_'] |
|
249 |
if 'target' in current_item: |
|
250 |
ids = [] |
|
251 |
for item in current_item['target']['identifier']: |
|
252 |
c_it = item |
|
253 |
c_it['url'] = resolveIdentifier(item['identifier'], item['schema']) |
|
254 |
ids .append(c_it) |
|
255 |
current_item['target']['identifier'] = ids |
|
256 |
hits.append(current_item) |
|
184 | 257 |
|
185 | 258 |
return hits |
186 | 259 |
|
... | ... | |
202 | 275 |
input_source = res['_source'] |
203 | 276 |
fixed_titles = [] |
204 | 277 |
for t in input_source.get('title'): |
205 |
if len(t) >0 and t[0]=='"' and t[-1]=='"':
|
|
278 |
if len(t) > 0 and t[0] == '"' and t[-1] == '"':
|
|
206 | 279 |
fixed_titles.append(t[1:-1]) |
207 | 280 |
else: |
208 | 281 |
fixed_titles.append(t) |
209 | 282 |
input_source['title'] = fixed_titles |
210 | 283 |
|
211 |
for ids in input_source.get('localIdentifier',[]):
|
|
284 |
for ids in input_source.get('localIdentifier', []):
|
|
212 | 285 |
ds = resolveIdentifier(ids['id'], ids['type']) |
213 | 286 |
ids['url'] = ds |
214 | 287 |
related_publications = [] |
... | ... | |
221 | 294 |
related_publications = self.related_type(id, 'publication', start) |
222 | 295 |
else: |
223 | 296 |
related_publications = self.related_type(id, 'publication') |
224 |
if len(related_publications) >0 :
|
|
297 |
if len(related_publications) > 0:
|
|
225 | 298 |
rel_source = related_publications[0] |
226 | 299 |
else: |
227 | 300 |
rel_source = {} |
... | ... | |
248 | 321 |
except Exception as e: |
249 | 322 |
print "Error on getting item " |
250 | 323 |
print e |
251 |
print "on line %i"% sys.exc_traceback.tb_lineno |
|
324 |
print "on line %i" % sys.exc_traceback.tb_lineno
|
|
252 | 325 |
return DLIESResponse() |
Also available in: Unified diff
made a new look