Project

General

Profile

1
from elasticsearch import *
2
from elasticsearch_dsl import *
3
from os import path
4
import os
5
from eu.dnetlib.util import get_index_properties
6
from elasticsearch_dsl.response import Response
7
import logging
8

    
9
log = logging.getLogger("scholexplorer")
10

    
11
class ScholixConnector(object):    
12

    
13
    __instance = None
14

    
15
    def __new__(cls):
16
        if ScholixConnector.__instance is None:
17
            ScholixConnector.__instance = object.__new__(cls)        
18
            props = get_index_properties()
19
            index_name = props['api.index']+"_scholix"
20
            index_host = [x for x in props['es_index'].split(',')]   
21
            #connections.create_connection(hosts=index_host, timeout=1000)    
22
            ScholixConnector.__instance.connection_pool = ConnectionPool([(Elasticsearch(hosts=index_host, timeout=1000),{}) for x in range(10)]) 
23
            ScholixConnector.__instance.index_host = index_host
24
            ScholixConnector.__instance.index_name = index_name
25
        return ScholixConnector.__instance
26

    
27
    def create_pidType_query(self, value, start):
28
        args = {start + '.identifier.schema': value}
29
        return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
30

    
31
    def create_pid_query(self, value, start):
32
        args = {start + '.identifier.identifier': value.lower()}
33
        return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
34

    
35
    def create_typology_query(self, value, start):
36
        args = {start + '.objectType': value}
37
        return Q('nested', path=start, query=Q('bool', must=[Q('match', **args)]))
38

    
39
    def create_dataSource_query(self, value):
40
        args = {'linkprovider.name': value}
41
        return Q('nested', path='linkprovider', query=Q('bool', must=[Q('match', **args)]))
42

    
43
    def create_publisher_query(self, value, start):
44
        args = {start + '.publisher.name': value}
45
        q = Q('nested', path=start + '.publisher', query=Q('bool', must=[Q('match', **args)]))
46
        return Q('nested', path=start, query=q)
47

    
48
    def list_datasources(self, ds_name=None):
49
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix')
50
        if ds_name:
51
            search_object = search_object.query(self.create_dataSource_query(ds_name))
52
        else:
53
            search_object = search_object.query()
54
        search_object.aggs.bucket('all_datasources', 'nested', path='linkprovider').bucket('all_names', 'terms',
55
                                                                                           field='linkprovider.name',
56
                                                                                           size=100)
57

    
58
        response = search_object.execute()
59
        if ds_name:
60
            for item in response.aggs.all_datasources.all_names.buckets:
61
                if item.key == ds_name:
62
                    yield dict(name=item.key, totalRelationships=item.doc_count)
63

    
64
        else:
65
            for item in response.aggs.all_datasources.all_names.buckets:
66
                yield dict(name=item.key, totalRelationships=item.doc_count)    
67
        
68

    
69
    def list_publisher(self, start, pub_name=None):
70
        log.info("Started Index from host")
71
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name)
72
        if pub_name:
73
            search_object = search_object.query(self.create_publisher_query(pub_name, start))
74
        else:
75
            search_object = search_object.query()
76
        search_object.aggs.bucket('all_targets', 'nested', path=start).bucket('all_t_pubs', 'nested',
77
                                                                              path=start + '.publisher').bucket(
78
            'all_pubs', 'terms',
79
            field=start + '.publisher.name',
80
            size=1000)
81

    
82
        response = search_object.execute()        
83
        for item in response.aggs.all_targets.all_t_pubs.all_pubs.buckets:
84
            if pub_name and item.key == pub_name:            
85
                yield dict(name=item.key, totalRelationships=item.doc_count)
86
            else:
87
                yield dict(name=item.key, totalRelationships=item.doc_count)
88
            
89
        
90

    
91
    def links(self, provider=None, s_pid=None, t_pid=None, s_publisher=None, t_publisher=None, s_pid_type=None,
92
              t_pid_type=None, target_Type=None, source_Type=None,page=0):       
93
        queries = []
94
        if provider:
95
            log.info("PROVIDER NOT NONE: {}".format(provider))
96
            queries.append(self.create_dataSource_query(provider))
97
        if s_pid:
98
            log.info("S_PID NOT NONE: {}".format(s_pid))
99
            queries.append(self.create_pid_query(s_pid, 'source'))
100
        if t_pid:
101
            queries.append(self.create_pid_query(t_pid, 'target'))
102
        if s_publisher:
103
            queries.append(self.create_publisher_query(s_publisher, 'source'))
104
        if t_publisher:
105
            queries.append(self.create_publisher_query(t_publisher, 'target'))
106
        if s_pid_type:
107
            queries.append(self.create_pidType_query(s_pid_type, 'source'))
108
        if t_pid_type:
109
            queries.append(self.create_pidType_query(t_pid_type, 'target'))
110
        if target_Type:
111
            if 'literature' == target_Type:
112
                target_Type = 'publication'
113
            queries.append(self.create_typology_query(target_Type,'target'))
114
        if source_Type:
115
            if 'literature' == source_Type:
116
                source_Type = 'publication'
117
            queries.append(self.create_typology_query(source_Type,'source'))
118

    
119
        
120
        q = None
121
        for item in queries:
122
            if not q:
123
                q = item
124
            else:
125
                q = q & item        
126
        log.debug("REQUEST CREATED {}".format(q))
127
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(q)
128
        print (search_object.to_dict())
129
        log.debug("Page request size is {}".format(page))
130
        if page > 9999:
131
            return []
132

    
133
        return search_object[page:page + 100].execute()
134

    
135

    
136
    def realtionToPid(self, pid, pidType=None, datasource=None, typology=None, page=0):
137
        if pidType:
138
            query = self.create_pid_pidType_query(pidType.lower(), pid.lower())
139
        else:
140
            query = self.create_source_pid_query(pid.lower())
141
        filters = []
142
        if datasource and len(datasource):
143
            filters.append(self.create_dataSource_query(datasource))
144
        if typology and len(typology):
145
            filters.append(self.create_typology_query(typology,'target'))
146
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(query)
147

    
148
        if len(filters):
149
            search_object = search_object.filter(Q('bool', must=filters))
150
            if page > 9999:
151
                return []
152
        return search_object[page:page + 100].execute()
153

    
154
    def realtionToTypology(self, typology, page=0):
155
        search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(
156
            self.create_typology_query(typology,'target'))
157
        if page > 9999:
158
            return []
159
        return search_object[page:page + 100].execute()
(1-1/5)