Project

General

Profile

1 55686 sandro.lab
from elasticsearch import *
2
from elasticsearch_dsl import *
3
from os import path
4
import os
5
from eu.dnetlib.util import get_index_properties
6
from elasticsearch_dsl.response import Response
7
import logging
8
9
log = logging.getLogger("scholexplorer")
10
11
class ScholixConnector(object):
12
13
    __instance = None
14
15
    def __new__(cls):
        """Return the shared ScholixConnector, creating it on first use.

        On the first call the settings returned by ``get_index_properties()``
        are read: the index name is ``props['api.index']`` followed by
        ``_scholix`` and the host list is ``props['es_index']`` split on
        commas.  The instance receives three attributes: ``connection_pool``
        (a ``ConnectionPool`` holding ten ``Elasticsearch`` clients created
        with ``timeout=1000``), ``index_host`` and ``index_name``.

        The instance is stored in the class attribute only after it has been
        completely configured, so an exception raised while reading the
        properties or building the clients does not leave a half-initialised
        singleton behind; the next call retries the initialisation.
        """
        if ScholixConnector.__instance is None:
            props = get_index_properties()
            index_name = props['api.index'] + "_scholix"
            index_host = props['es_index'].split(',')

            instance = object.__new__(cls)
            # The pool is fed (client, options) pairs; the options are left empty.
            instance.connection_pool = ConnectionPool(
                [(Elasticsearch(hosts=index_host, timeout=1000), {}) for _ in range(10)])
            instance.index_host = index_host
            instance.index_name = index_name

            # Publish last: everything above may raise.
            ScholixConnector.__instance = instance
        return ScholixConnector.__instance
26
27
    def create_pidType_query(self, value, start):
        """Build a nested query on the identifier schema of one side of a link.

        :param value: value matched against ``<start>.identifier.schema``
        :param start: prefix of the nested path (this class passes 'source' or 'target')
        :return: ``nested`` query on ``<start>.identifier`` wrapping a ``bool`` query
                 whose single ``must`` clause is the ``match``
        """
        identifier_path = start + '.identifier'
        schema_match = Q('match', **{identifier_path + '.schema': value})
        return Q('nested', path=identifier_path, query=Q('bool', must=[schema_match]))
30
31
    def create_pid_query(self, value, start):
        """Build a nested query on the identifier value of one side of a link.

        The value is lower-cased before being matched.

        :param value: identifier matched against ``<start>.identifier.identifier``
        :param start: prefix of the nested path (this class passes 'source' or 'target')
        :return: ``nested`` query on ``<start>.identifier`` wrapping a ``bool`` query
                 whose single ``must`` clause is the ``match``
        """
        identifier_path = start + '.identifier'
        pid_match = Q('match', **{identifier_path + '.identifier': value.lower()})
        return Q('nested', path=identifier_path, query=Q('bool', must=[pid_match]))
34
35 57982 sandro.lab
    def create_typology_query(self, value, start):
        """Build a nested query on the object type of one side of a link.

        :param value: value matched against ``<start>.objectType``
        :param start: nested path (this class passes 'source' or 'target')
        :return: ``nested`` query on ``<start>`` wrapping a ``bool`` query whose
                 single ``must`` clause is the ``match``
        """
        type_match = Q('match', **{start + '.objectType': value})
        must_clause = Q('bool', must=[type_match])
        return Q('nested', path=start, query=must_clause)
38 55686 sandro.lab
39
    def create_dataSource_query(self, value):
        """Build a nested query on the name of the link provider.

        :param value: value matched against ``linkprovider.name``
        :return: ``nested`` query on ``linkprovider`` wrapping a ``bool`` query
                 whose single ``must`` clause is the ``match``
        """
        name_match = Q('match', **{'linkprovider.name': value})
        must_clause = Q('bool', must=[name_match])
        return Q('nested', path='linkprovider', query=must_clause)
42
43
    def create_publisher_query(self, value, start):
        """Build a doubly nested query on the publisher name of one side of a link.

        :param value: value matched against ``<start>.publisher.name``
        :param start: outer nested path (this class passes 'source' or 'target')
        :return: ``nested`` query on ``<start>`` wrapping a ``nested`` query on
                 ``<start>.publisher`` that holds the ``bool``/``must`` ``match``
        """
        publisher_path = start + '.publisher'
        name_match = Q('match', **{publisher_path + '.name': value})
        publisher_query = Q('nested', path=publisher_path, query=Q('bool', must=[name_match]))
        return Q('nested', path=start, query=publisher_query)
47
48
    def list_datasources(self, ds_name=None):
        """Yield the link providers found in the index with their bucket counts.

        A ``terms`` aggregation (at most 100 buckets) on ``linkprovider.name`` is
        run inside a ``nested`` aggregation on ``linkprovider``.  When
        ``ds_name`` is given the search is restricted with
        ``create_dataSource_query`` and only the bucket whose key equals
        ``ds_name`` is yielded; otherwise every bucket is yielded.

        :param ds_name: optional provider name to restrict the listing to
        :return: generator of ``dict(name=..., totalRelationships=...)``
        """
        search = Search(using=self.connection_pool.get_connection(), index=self.index_name)
        search = search.doc_type('scholix')
        search = search.query(self.create_dataSource_query(ds_name)) if ds_name else search.query()

        providers_agg = search.aggs.bucket('all_datasources', 'nested', path='linkprovider')
        providers_agg.bucket('all_names', 'terms', field='linkprovider.name', size=100)

        result = search.execute()
        for bucket in result.aggs.all_datasources.all_names.buckets:
            # Without a requested name every bucket is reported.
            if not ds_name or bucket.key == ds_name:
                yield dict(name=bucket.key, totalRelationships=bucket.doc_count)
67
68
69
    def list_publisher(self, start, pub_name=None):
        """Yield the publishers of one side of the links with their bucket counts.

        A ``terms`` aggregation (at most 1000 buckets) on
        ``<start>.publisher.name`` is run inside two ``nested`` aggregations
        (``<start>`` and ``<start>.publisher``).  When ``pub_name`` is given the
        search is restricted with ``create_publisher_query`` and only the bucket
        whose key equals ``pub_name`` is yielded, mirroring what
        ``list_datasources`` does for provider names.  Previously both branches
        of the name test yielded, so the filter had no effect.

        :param start: nested path to aggregate on (e.g. 'source' or 'target')
        :param pub_name: optional publisher name to restrict the listing to
        :return: generator of ``dict(name=..., totalRelationships=...)``
        """
        log.info("Started Index from host")
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name)
        if pub_name:
            search_object = search_object.query(self.create_publisher_query(pub_name, start))
        else:
            search_object = search_object.query()

        publisher_path = start + '.publisher'
        search_object.aggs \
            .bucket('all_targets', 'nested', path=start) \
            .bucket('all_t_pubs', 'nested', path=publisher_path) \
            .bucket('all_pubs', 'terms', field=publisher_path + '.name', size=1000)

        response = search_object.execute()
        for item in response.aggs.all_targets.all_t_pubs.all_pubs.buckets:
            # Matching documents may carry other publishers too; keep only the
            # requested one when a name was supplied.
            if pub_name and item.key != pub_name:
                continue
            yield dict(name=item.key, totalRelationships=item.doc_count)
88
89
90
91
    def links(self, provider=None, s_pid=None, t_pid=None, s_publisher=None, t_publisher=None, s_pid_type=None,
              t_pid_type=None, target_Type=None, source_Type=None, page=0):
        """Search the links that satisfy all the given criteria.

        Every argument that is truthy contributes one query; the queries are
        combined with ``&``.  When no criterion is given no query is applied to
        the search (previously ``None`` was handed to ``Search.query``).

        :param provider: link provider name
        :param s_pid: identifier of the source object
        :param t_pid: identifier of the target object
        :param s_publisher: publisher name of the source object
        :param t_publisher: publisher name of the target object
        :param s_pid_type: identifier schema of the source object
        :param t_pid_type: identifier schema of the target object
        :param target_Type: object type of the target; 'literature' is
                            translated to 'publication'
        :param source_Type: object type of the source; 'literature' is
                            translated to 'publication'
        :param page: offset of the first hit; hits ``page`` to ``page + 100``
                     are requested
        :return: the executed search response, or an empty list when ``page``
                 is greater than 9999
        """
        queries = []
        if provider:
            log.info("PROVIDER NOT NONE: {}".format(provider))
            queries.append(self.create_dataSource_query(provider))
        if s_pid:
            log.info("S_PID NOT NONE: {}".format(s_pid))
            queries.append(self.create_pid_query(s_pid, 'source'))
        if t_pid:
            queries.append(self.create_pid_query(t_pid, 'target'))
        if s_publisher:
            queries.append(self.create_publisher_query(s_publisher, 'source'))
        if t_publisher:
            queries.append(self.create_publisher_query(t_publisher, 'target'))
        if s_pid_type:
            queries.append(self.create_pidType_query(s_pid_type, 'source'))
        if t_pid_type:
            queries.append(self.create_pidType_query(t_pid_type, 'target'))
        if target_Type:
            if 'literature' == target_Type:
                target_Type = 'publication'
            queries.append(self.create_typology_query(target_Type, 'target'))
        if source_Type:
            if 'literature' == source_Type:
                source_Type = 'publication'
            queries.append(self.create_typology_query(source_Type, 'source'))

        # AND all the collected criteria together; q stays None when there are none.
        q = None
        for item in queries:
            q = item if q is None else q & item
        log.debug("REQUEST CREATED {}".format(q))

        search_object = Search(using=self.connection_pool.get_connection(),
                               index=self.index_name).doc_type('scholix')
        if q is not None:
            search_object = search_object.query(q)
        # The request body goes to the debug log instead of stdout.
        log.debug("Search body: {}".format(search_object.to_dict()))
        log.debug("Page request size is {}".format(page))
        # NOTE(review): offsets 9901-9999 still ask for hits beyond position
        # 10000 -- confirm the index's result window allows that.
        if page > 9999:
            return []

        return search_object[page:page + 100].execute()
134
135
136
    def realtionToPid(self, pid, pidType=None, datasource=None, typology=None, page=0):
        """Search the links related to an identifier.

        The main query comes from ``create_pid_pidType_query`` when ``pidType``
        is given and from ``create_source_pid_query`` otherwise; both receive
        lower-cased values.
        NOTE(review): neither helper is defined in the visible part of this
        class -- confirm they exist elsewhere.

        ``datasource`` and ``typology`` (matched on the target side), when
        non-empty, are added as a ``bool``/``must`` filter.

        The ``page`` limit is enforced on every call; previously it was only
        checked when at least one filter was supplied.

        :param pid: identifier to look up
        :param pidType: optional identifier schema
        :param datasource: optional link provider name
        :param typology: optional object type of the target
        :param page: offset of the first hit; hits ``page`` to ``page + 100``
                     are requested
        :return: the executed search response, or an empty list when ``page``
                 is greater than 9999
        """
        if pidType:
            query = self.create_pid_pidType_query(pidType.lower(), pid.lower())
        else:
            query = self.create_source_pid_query(pid.lower())

        filters = []
        if datasource:
            filters.append(self.create_dataSource_query(datasource))
        if typology:
            filters.append(self.create_typology_query(typology, 'target'))

        search_object = Search(using=self.connection_pool.get_connection(),
                               index=self.index_name).doc_type('scholix').query(query)
        if filters:
            search_object = search_object.filter(Q('bool', must=filters))

        # Same limit as links() and realtionToTypology(), applied unconditionally.
        if page > 9999:
            return []
        return search_object[page:page + 100].execute()
153
154
    def realtionToTypology(self, typology, page=0):
        """Search the links whose target has the given object type.

        :param typology: object type matched on the target side
        :param page: offset of the first hit; hits ``page`` to ``page + 100``
                     are requested
        :return: the executed search response, or an empty list when ``page``
                 is greater than 9999
        """
        base_search = Search(using=self.connection_pool.get_connection(), index=self.index_name)
        target_type_query = self.create_typology_query(typology, 'target')
        typed_search = base_search.doc_type('scholix').query(target_type_query)
        if page > 9999:
            return []
        first, last = page, page + 100
        return typed_search[first:last].execute()