Project

General

Profile

1
from json import JSONEncoder
2
import sys
3
import re
4
from elasticsearch import Elasticsearch
5
from elasticsearch_dsl import *
6
import logging
7
from eu.dnetlib.util import get_index_properties
8
import traceback
9

    
10
import os
11
from os import path
12

    
13
log = logging.getLogger("scholexplorer-portal")

# Most bio-database schemes below share the same NCBI nucest/GenBank resolver.
_GENBANK_URL = "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank"

# Maps a lowercase PID scheme to a '%s' URL template used to build a
# landing-page link for an identifier of that scheme.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "url": "%s",
}
pid_resolver.update((scheme, _GENBANK_URL) for scheme in (
    "genbank", "nuccore", "swiss-prot", "arrayexpress", "biomodels",
    "bmrb", "ena", "geo", "ensembl", "mgi", "bind", "pride", "ddbj",
    "bioproject", "embl", "sra",
))
40

    
41

    
42
def resolveIdentifier(pid, pid_type):
    """Build a landing-page URL for the identifier *pid* of scheme *pid_type*.

    Returns "" when pid_type is None.  If *pid* itself looks like a DOI the
    declared scheme is overridden with 'doi'.  Known schemes use the templates
    in ``pid_resolver``; 'openaire' and 'url' are special-cased; anything else
    falls back to identifiers.org.
    """
    if pid_type is None:
        return ""
    # Crossref-style DOI pattern; overrides a wrong/missing declared scheme.
    doi_regex = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b"
    if re.match(doi_regex, pid):
        log.debug("It should be doi")
        pid_type = 'doi'
    # Normalize once instead of calling .lower() in every branch.
    scheme = pid_type.lower()
    if scheme in pid_resolver:
        return pid_resolver[scheme] % pid
    if scheme == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    if scheme == 'url':
        return pid
    # Generic fallback for unknown schemes (note: original-case pid_type).
    return "http://identifiers.org/%s:%s" % (pid_type, pid)
58

    
59

    
60
def create_typology_filter(value):
    """Build a match query restricting results to the given typology."""
    return Q('match', typology=value)
62

    
63

    
64
def create_pid_type_filter(value):
    """Build a nested bool/match query selecting objects whose
    localIdentifier.type equals *value*."""
    match_query = Q('match', **{'localIdentifier.type': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[match_query]))
67

    
68

    
69
def create_pid_query(value):
    """Build a nested bool/match query selecting objects whose
    localIdentifier.id equals *value*."""
    match_query = Q('match', **{'localIdentifier.id': value})
    return Q('nested', path='localIdentifier', query=Q('bool', must=[match_query]))
72

    
73

    
74
def create_publisher_filter(value):
    """Build a match query restricting results to the given publisher."""
    return Q('match', publisher=value)
76

    
77

    
78
def create_datasource_filter(value):
    """Build a nested bool/match query selecting objects collected from the
    datasource named *value*."""
    match_query = Q('match', **{'datasources.datasourceName': value})
    return Q('nested', path='datasources', query=Q('bool', must=[match_query]))
81

    
82

    
83
class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serializes otherwise-unserializable objects
    (e.g. DLIESResponse) via their attribute dictionary."""

    def default(self, o):
        """Fall back to the instance __dict__ for unknown types."""
        return o.__dict__
86

    
87

    
88
class DLIESResponse(object):
    """Container for a search answer: facet counts, total hits and the hit list.

    Attributes:
        facet: dict of facet-name -> list of {key, count} entries.
        total: total number of matching documents.
        hits:  list of result documents (plain dicts).
    """

    def __init__(self, facet=None, total=0, hits=None):
        # Both mutable defaults are created inside the body: the original
        # ``hits=[]`` default was shared across every call, so appending to
        # one response's hits leaked into later default-constructed responses.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
95

    
96

    
97
class DLIESConnector(object):
    """Query layer over the ScholExplorer Elasticsearch indexes.

    Uses two indexes derived from the configured base name:
    ``<index_name>_object`` (publications/datasets/unknown objects) and
    ``<index_name>_scholix`` (relations between them).
    """

    def __init__(self):
        """Read index configuration via get_index_properties() and open the client."""
        props = get_index_properties()
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host, timeout=600000)
        self.index_name = props['api.index']

    @staticmethod
    def _fix_titles(titles):
        """Strip one pair of surrounding double quotes from each title string."""
        fixed = []
        for t in titles:
            if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                fixed.append(t[1:-1])
            else:
                fixed.append(t)
        return fixed

    @staticmethod
    def _add_facet_aggregations(s):
        """Attach (in place) the facet aggregations shared by the query methods."""
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket(
            'all_names', 'terms', field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket(
            'all_types', 'terms', field='localIdentifier.type')

    @staticmethod
    def _extract_facets(response):
        """Convert the standard aggregations of *response* into the facet dict
        expected by DLIESResponse."""
        pid_types = [dict(key=tag.key, count=tag.doc_count)
                     for tag in response.aggs.all_pids.all_types.buckets]
        datasources = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.all_datasources.all_names.buckets]
        typologies = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.typologies.buckets]
        # Documents without a publisher produce an empty bucket key; skip it.
        publishers = [dict(key=tag.key, count=tag.doc_count)
                      for tag in response.aggs.all_publisher.buckets
                      if len(tag.key) > 0]
        return dict(pid=pid_types, typology=typologies,
                    datasource=datasources, publishers=publishers)

    def get_main_page_stats(self):
        """Return counts for the landing page: total relations plus per-typology
        object counts for 'dataset' and 'publication'."""
        # Scholix count is halved — presumably each relation is indexed in both
        # directions (source->target and target->source); TODO confirm.
        stats = dict(total=int(Search(using=self.client, index=self.index_name + "_scholix").count() / 2))
        for item in ['dataset', 'publication']:
            s = Search(using=self.client, index=self.index_name + "_object").query(Q('match', typology=item))
            stats[item] = s.count()
        return stats

    def query_by_id(self, id):
        """Look up objects whose localIdentifier.id equals *id* and return them
        with resolved identifier URLs and facet counts."""
        s = Search(using=self.client, index=self.index_name + "_object")
        s = s.query(create_pid_query(id))
        self._add_facet_aggregations(s)
        response = s.execute()

        hits = []
        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            # Attach a resolved landing-page URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ids['url'] = resolveIdentifier(ids['id'], ids['type'])
            input_source['title'] = self._fix_titles(input_source.get('title', []))
            hits.append(input_source)

        return DLIESResponse(total=response.hits.total,
                             facet=self._extract_facets(response), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Full-text search over title/abstract with optional facet filters.

        :param textual_query: query string; '*' means match-all
        :param start: pagination offset (None disables pagination)
        :param end: pagination end; defaults to start + 10 when start is given
        :param user_filter: string of the form "key_value__key_value..." with
            keys typology/datasource/pidtype/publisher
        """
        s = Search(using=self.client, index=self.index_name + "_object")
        if textual_query == '*':
            q = Q()
        else:
            q = Q("multi_match", query=textual_query, fields=['title', 'abstract'])

        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            # Dispatch table instead of an if/elif chain; unknown keys are
            # silently ignored, as before.
            builders = {
                'typology': create_typology_filter,
                'datasource': create_datasource_filter,
                'pidtype': create_pid_type_filter,
                'publisher': create_publisher_filter,
            }
            for f in user_filter.split('__'):
                parts = f.split('_')
                builder = builders.get(parts[0])
                if builder is not None:
                    filter_queries.append(builder(parts[1]))

        if len(filter_queries) > 0:
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        self._add_facet_aggregations(s)

        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        # Was a pair of debug print() calls writing to stdout.
        log.debug("index : %s_object", self.index_name)
        log.debug("total hits: %s", response.hits.total)

        hits = []
        for index_result in response.hits:
            input_source = index_result.__dict__['_d_']
            titles = input_source.get('title', [])
            if titles is not None:
                input_source['title'] = self._fix_titles(titles)
            else:
                # Explicit null title field in the document.
                input_source['title'] = ["title not available"]
            hits.append(input_source)

        return DLIESResponse(total=s.count(),
                             facet=self._extract_facets(response), hits=hits)

    def related_type(self, object_id, object_type, start=None):
        """Return scholix relations whose source is *object_id* and whose target
        objectType is *object_type*, optionally paginated in pages of 10."""
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name + "_scholix").query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []
        for index_hit in response.hits:
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                resolved = []
                for item in current_item['target']['identifier']:
                    # Attach a resolved landing-page URL to each target identifier.
                    item['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    resolved.append(item)
                current_item['target']['identifier'] = resolved
            hits.append(current_item)

        return hits

    def fix_collectedFrom(self, source, relation):
        """Copy each matching collectedFrom provider's provisionMode from
        *relation* onto the corresponding datasource entry of *source*.

        Returns *source* (mutated in place), or None when *relation* is None.
        """
        if relation is None:
            return
        # Guard against a relation without a 'source' section (the original
        # would raise AttributeError on None).
        relSource = relation.get('source') or {}
        collectedFrom = relSource.get('collectedFrom', [])
        if collectedFrom is not None:
            for coll in collectedFrom:
                for d in source['datasources']:
                    if d['datasourceName'] == coll['provider']['name']:
                        d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        """Fetch one object by identifier together with its related items.

        :param id: dnet identifier of the object
        :param type: typology ('publication'|'dataset'|'unknown') whose related
            list should honour the *start* pagination offset
        :param start: pagination offset forwarded to related_type
        :returns: DLIESResponse with [object, related-lists dict] as hits, or an
            empty DLIESResponse on error.
        """
        try:
            # NOTE(review): doc_type="_all" is removed in recent ES clients —
            # confirm the pinned elasticsearch-py version still accepts it.
            res = self.client.get(index=self.index_name + "_object", doc_type="_all", id=id, _source=True)
            input_source = res['_source']
            input_source['title'] = self._fix_titles(input_source.get('title', []))

            related_publications = []
            related_dataset = []
            related_unknown = []
            rel_source = None

            # Default the counters to 0: the original .get(...) > 0 raised
            # TypeError (None > 0) for documents missing the field, and the
            # broad except below silently turned that into an empty response.
            if input_source.get('relatedPublications', 0) > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                rel_source = related_publications[0] if related_publications else {}

            if input_source.get('relatedDatasets', 0) > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                # Guard empty lists (the original indexed [0] unconditionally).
                if related_dataset:
                    rel_source = related_dataset[0]

            if input_source.get('relatedUnknown', 0) > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                if related_unknown:
                    rel_source = related_unknown[0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits = [input_source,
                    dict(related_publications=related_publications,
                         related_dataset=related_dataset,
                         related_unknown=related_unknown)]

            return DLIESResponse(total=1, hits=hits)
        except Exception:
            # The original logged '"on line %i" % sys.exc_info', which raised a
            # TypeError of its own; log.exception records the full traceback.
            log.exception("Error on getting item %s", id)
            return DLIESResponse()
(2-2/5)