Project

General

Profile

1
from json import JSONEncoder
2
import sys
3
import re
4
from elasticsearch import Elasticsearch
5
from elasticsearch_dsl import *
6
import logging
7
from eu.dnetlib.util import get_index_properties
8
import traceback
9

    
10
import os
11
from os import path
12

    
13
log = logging.getLogger("scholexplorer-portal")

# Most bio-database schemes below share the same NCBI nucest/GenBank resolver.
_GENBANK_URL = "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank"

# Maps a lowercase PID scheme to a '%s' URL template used to build a
# landing-page link for an identifier of that scheme.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "url": "%s",
}
pid_resolver.update((scheme, _GENBANK_URL) for scheme in (
    "genbank", "nuccore", "swiss-prot", "arrayexpress", "biomodels",
    "bmrb", "ena", "geo", "ensembl", "mgi", "bind", "pride", "ddbj",
    "bioproject", "embl", "sra",
))
40

    
41

    
42
def resolveIdentifier(pid, pid_type):
    """Build a landing-page URL for the identifier *pid* of scheme *pid_type*.

    Returns "" when pid_type is None.  If *pid* itself looks like a DOI the
    declared scheme is overridden with 'doi'.  Known schemes use the templates
    in ``pid_resolver``; 'openaire' and 'url' are special-cased; anything else
    falls back to identifiers.org.
    """
    if pid_type is None:
        return ""
    # Crossref-style DOI pattern; overrides a wrong/missing declared scheme.
    doi_regex = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b"
    if re.match(doi_regex, pid):
        log.debug("It should be doi")
        pid_type = 'doi'
    # Normalize once instead of calling .lower() in every branch.
    scheme = pid_type.lower()
    if scheme in pid_resolver:
        return pid_resolver[scheme] % pid
    if scheme == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    if scheme == 'url':
        return pid
    # Generic fallback for unknown schemes (note: original-case pid_type).
    return "http://identifiers.org/%s:%s" % (pid_type, pid)
58

    
59

    
60
def create_typology_filter(value):
    """Build a match query restricting results to the given typology."""
    return Q('match', typology=value)
62

    
63

    
64
def create_pid_type_filter(value):
    """Build a nested bool/match query selecting objects whose
    localIdentifier.type equals *value*."""
    match_query = Q('match', **{'localIdentifier.type': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[match_query]))
67

    
68

    
69
def create_pid_query(value):
    """Build a nested bool/match query selecting objects whose
    localIdentifier.id equals *value*."""
    match_query = Q('match', **{'localIdentifier.id': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[match_query]))
72

    
73

    
74
def create_publisher_filter(value):
    """Build a match query restricting results to the given publisher."""
    return Q('match', publisher=value)
76

    
77

    
78
def create_datasource_filter(value):
    """Build a nested bool/match query selecting objects collected from the
    datasource named *value*."""
    match_query = Q('match', **{'datasources.datasourceName': value})
    return Q('nested', path='datasources', query=Q('bool', must=[match_query]))
81

    
82

    
83
class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serializes otherwise-unserializable objects
    (e.g. DLIESResponse) via their attribute dictionary."""

    def default(self, o):
        """Fall back to the instance __dict__ for unknown types."""
        return o.__dict__
86

    
87

    
88
class DLIESResponse(object):
    """Container for a search answer: facet counts, total hits and the hit list.

    Attributes:
        facet: dict of facet-name -> list of {key, count} entries.
        total: total number of matching documents.
        hits:  list of result documents (plain dicts).
    """

    def __init__(self, facet=None, total=0, hits=None):
        # Both mutable defaults are created inside the body: the original
        # ``hits=[]`` default was shared across every call, so appending to
        # one response's hits leaked into later default-constructed responses.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
95

    
96

    
97
class DLIESConnector(object):
    """Query layer over the ScholExplorer Elasticsearch indexes.

    Uses two indexes derived from the configured base name:
    ``<index_name>_object`` (publications/datasets/unknown objects) and
    ``<index_name>_scholix`` (relations between them).
    """

    def __init__(self):
        """Read index configuration via get_index_properties() and open the client."""
        props = get_index_properties()
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host, timeout=600000)
        self.index_name = props['api.index']

    @staticmethod
    def _fix_titles(titles):
        """Strip one pair of surrounding double quotes from each title string."""
        fixed = []
        for t in titles:
            if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                fixed.append(t[1:-1])
            else:
                fixed.append(t)
        return fixed

    @staticmethod
    def _add_facet_aggregations(s):
        """Attach (in place) the facet aggregations shared by the query methods."""
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket(
            'all_names', 'terms', field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket(
            'all_types', 'terms', field='localIdentifier.type')

    @staticmethod
    def _extract_facets(response):
        """Convert the standard aggregations of *response* into the facet dict
        expected by DLIESResponse."""
        pid_types = [dict(key=tag.key, count=tag.doc_count)
                     for tag in response.aggs.all_pids.all_types.buckets]
        datasources = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.all_datasources.all_names.buckets]
        typologies = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.typologies.buckets]
        # Documents without a publisher produce an empty bucket key; skip it.
        publishers = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.all_publisher.buckets
                      if len(tag.key) > 0]
        return dict(pid=pid_types, typology=typologies,
                    datasource=datasources, publishers=publishers)

    def get_main_page_stats(self):
        """Return counts for the landing page: total relations plus per-typology
        object counts for 'dataset' and 'publication'."""
        # Scholix count is halved — presumably each relation is indexed in both
        # directions (source->target and target->source); TODO confirm.
        stats = dict(total=int(Search(using=self.client, index=self.index_name + "_scholix").count() / 2))
        for item in ['dataset', 'publication']:
            s = Search(using=self.client, index=self.index_name + "_object").query(Q('match', typology=item))
            stats[item] = s.count()
        return stats

    def query_by_id(self, id):
        """Look up objects whose localIdentifier.id equals *id* and return them
        with resolved identifier URLs and facet counts."""
        s = Search(using=self.client, index=self.index_name + "_object")
        s = s.query(create_pid_query(id))
        self._add_facet_aggregations(s)
        response = s.execute()

        hits = []
        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            # Attach a resolved landing-page URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ids['url'] = resolveIdentifier(ids['id'], ids['type'])
            input_source['title'] = self._fix_titles(input_source.get('title', []))
            hits.append(input_source)

        return DLIESResponse(total=response.hits.total,
                             facet=self._extract_facets(response), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Full-text search over title/abstract with optional facet filters.

        :param textual_query: query string; '*' means match-all
        :param start: pagination offset (None disables pagination)
        :param end: pagination end; defaults to start + 10 when start is given
        :param user_filter: string of the form "key_value__key_value..." with
            keys typology/datasource/pidtype/publisher
        """
        s = Search(using=self.client, index=self.index_name + "_object")
        if textual_query == '*':
            q = Q()
        else:
            q = Q("multi_match", query=textual_query, fields=['title', 'abstract'])

        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            # Dispatch table instead of an if/elif chain; unknown keys are
            # silently ignored, as before.
            builders = {
                'typology': create_typology_filter,
                'datasource': create_datasource_filter,
                'pidtype': create_pid_type_filter,
                'publisher': create_publisher_filter,
            }
            for f in user_filter.split('__'):
                parts = f.split('_')
                builder = builders.get(parts[0])
                if builder is not None:
                    filter_queries.append(builder(parts[1]))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        self._add_facet_aggregations(s)

        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        # Was a pair of debug print() calls writing to stdout.
        log.debug("index : %s_object", self.index_name)
        log.debug("total hits: %s", response.hits.total)

        hits = []
        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            titles = input_source.get('title', [])
            if titles is not None:
                input_source['title'] = self._fix_titles(titles)
            else:
                # Explicit null title field in the document.
                input_source['title'] = ["title not available"]
            hits.append(input_source)

        return DLIESResponse(total=s.count(),
                             facet=self._extract_facets(response), hits=hits)

    def related_type(self, object_id, object_type, start=None):
        """Return scholix relations whose source is *object_id* and whose target
        objectType is *object_type*, optionally paginated in pages of 10."""
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name + "_scholix").query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []
        for index_hit in response.hits:
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                resolved = []
                for item in current_item['target']['identifier']:
                    # Attach a resolved landing-page URL to each target identifier.
                    item['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    resolved.append(item)
                current_item['target']['identifier'] = resolved
            hits.append(current_item)

        return hits

    def fix_collectedFrom(self, source, relation):
        """Copy each matching collectedFrom provider's provisionMode from
        *relation* onto the corresponding datasource entry of *source*.

        Returns *source* (mutated in place), or None when *relation* is None.
        """
        if relation is None:
            return
        # Guard against a relation without a 'source' section (the original
        # would raise AttributeError on None).
        relSource = relation.get('source') or {}
        collectedFrom = relSource.get('collectedFrom', [])
        if collectedFrom is not None:
            for coll in collectedFrom:
                for d in source['datasources']:
                    if d['datasourceName'] == coll['provider']['name']:
                        d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        """Fetch one object by identifier together with its related items.

        :param id: dnet identifier of the object
        :param type: typology ('publication'|'dataset'|'unknown') whose related
            list should honour the *start* pagination offset
        :param start: pagination offset forwarded to related_type
        :returns: DLIESResponse with [object, related-lists dict] as hits, or an
            empty DLIESResponse on error.
        """
        try:
            # NOTE(review): doc_type="_all" is removed in recent ES clients —
            # confirm the pinned elasticsearch-py version still accepts it.
            res = self.client.get(index=self.index_name + "_object", doc_type="_all", id=id, _source=True)
            input_source = res['_source']
            input_source['title'] = self._fix_titles(input_source.get('title', []))

            related_publications = []
            related_dataset = []
            related_unknown = []
            rel_source = None

            # Default the counters to 0: the original .get(...) > 0 raised
            # TypeError (None > 0) for documents missing the field, and the
            # broad except below silently turned that into an empty response.
            if input_source.get('relatedPublications', 0) > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                rel_source = related_publications[0] if related_publications else {}

            if input_source.get('relatedDatasets', 0) > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                # Guard empty lists (the original indexed [0] unconditionally).
                if related_dataset:
                    rel_source = related_dataset[0]

            if input_source.get('relatedUnknown', 0) > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                if related_unknown:
                    rel_source = related_unknown[0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits = [input_source,
                    dict(related_publications=related_publications,
                         related_dataset=related_dataset,
                         related_unknown=related_unknown)]

            return DLIESResponse(total=1, hits=hits)
        except Exception:
            # The original logged '"on line %i" % sys.exc_info', which raised a
            # TypeError of its own; log.exception records the full traceback.
            log.exception("Error on getting item %s", id)
            return DLIESResponse()
(2-2/5)