1 |
55686
|
sandro.lab
|
from elasticsearch import *
|
2 |
|
|
from elasticsearch_dsl import *
|
3 |
|
|
from os import path
|
4 |
|
|
import os
|
5 |
|
|
from eu.dnetlib.util import get_index_properties
|
6 |
|
|
from elasticsearch_dsl.response import Response
|
7 |
|
|
import logging
|
8 |
|
|
|
9 |
|
|
log = logging.getLogger("scholexplorer")
|
10 |
|
|
|
11 |
|
|
class ScholixConnector(object):
|
12 |
|
|
|
13 |
|
|
__instance = None
|
14 |
|
|
|
15 |
|
|
def __new__(cls):
|
16 |
|
|
if ScholixConnector.__instance is None:
|
17 |
|
|
ScholixConnector.__instance = object.__new__(cls)
|
18 |
|
|
props = get_index_properties()
|
19 |
58279
|
sandro.lab
|
index_name = props['api.index']+"_scholix"
|
20 |
|
|
index_host = [x for x in props['es_index'].split(',')]
|
21 |
56193
|
sandro.lab
|
#connections.create_connection(hosts=index_host, timeout=1000)
|
22 |
58279
|
sandro.lab
|
ScholixConnector.__instance.connection_pool = ConnectionPool([(Elasticsearch(hosts=index_host, timeout=1000),{}) for x in range(10)])
|
23 |
55765
|
sandro.lab
|
ScholixConnector.__instance.index_host = index_host
|
24 |
55686
|
sandro.lab
|
ScholixConnector.__instance.index_name = index_name
|
25 |
|
|
return ScholixConnector.__instance
|
26 |
|
|
|
27 |
|
|
def create_pidType_query(self, value, start):
|
28 |
|
|
args = {start + '.identifier.schema': value}
|
29 |
|
|
return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
|
30 |
|
|
|
31 |
|
|
def create_pid_query(self, value, start):
|
32 |
|
|
args = {start + '.identifier.identifier': value.lower()}
|
33 |
|
|
return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
|
34 |
|
|
|
35 |
57982
|
sandro.lab
|
def create_typology_query(self, value, start):
|
36 |
|
|
args = {start + '.objectType': value}
|
37 |
|
|
return Q('nested', path=start, query=Q('bool', must=[Q('match', **args)]))
|
38 |
55686
|
sandro.lab
|
|
39 |
|
|
def create_dataSource_query(self, value):
|
40 |
|
|
args = {'linkprovider.name': value}
|
41 |
|
|
return Q('nested', path='linkprovider', query=Q('bool', must=[Q('match', **args)]))
|
42 |
|
|
|
43 |
|
|
def create_publisher_query(self, value, start):
|
44 |
|
|
args = {start + '.publisher.name': value}
|
45 |
|
|
q = Q('nested', path=start + '.publisher', query=Q('bool', must=[Q('match', **args)]))
|
46 |
|
|
return Q('nested', path=start, query=q)
|
47 |
|
|
|
48 |
|
|
def list_datasources(self, ds_name=None):
|
49 |
58279
|
sandro.lab
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix')
|
50 |
55686
|
sandro.lab
|
if ds_name:
|
51 |
|
|
search_object = search_object.query(self.create_dataSource_query(ds_name))
|
52 |
|
|
else:
|
53 |
|
|
search_object = search_object.query()
|
54 |
|
|
search_object.aggs.bucket('all_datasources', 'nested', path='linkprovider').bucket('all_names', 'terms',
|
55 |
|
|
field='linkprovider.name',
|
56 |
|
|
size=100)
|
57 |
|
|
|
58 |
|
|
response = search_object.execute()
|
59 |
|
|
if ds_name:
|
60 |
|
|
for item in response.aggs.all_datasources.all_names.buckets:
|
61 |
|
|
if item.key == ds_name:
|
62 |
|
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
63 |
|
|
|
64 |
|
|
else:
|
65 |
|
|
for item in response.aggs.all_datasources.all_names.buckets:
|
66 |
|
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
67 |
|
|
|
68 |
|
|
|
69 |
|
|
def list_publisher(self, start, pub_name=None):
|
70 |
|
|
log.info("Started Index from host")
|
71 |
58279
|
sandro.lab
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name)
|
72 |
55686
|
sandro.lab
|
if pub_name:
|
73 |
|
|
search_object = search_object.query(self.create_publisher_query(pub_name, start))
|
74 |
|
|
else:
|
75 |
|
|
search_object = search_object.query()
|
76 |
|
|
search_object.aggs.bucket('all_targets', 'nested', path=start).bucket('all_t_pubs', 'nested',
|
77 |
|
|
path=start + '.publisher').bucket(
|
78 |
|
|
'all_pubs', 'terms',
|
79 |
|
|
field=start + '.publisher.name',
|
80 |
|
|
size=1000)
|
81 |
|
|
|
82 |
|
|
response = search_object.execute()
|
83 |
|
|
for item in response.aggs.all_targets.all_t_pubs.all_pubs.buckets:
|
84 |
|
|
if pub_name and item.key == pub_name:
|
85 |
|
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
86 |
|
|
else:
|
87 |
|
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
88 |
|
|
|
89 |
|
|
|
90 |
|
|
|
91 |
|
|
def links(self, provider=None, s_pid=None, t_pid=None, s_publisher=None, t_publisher=None, s_pid_type=None,
|
92 |
57982
|
sandro.lab
|
t_pid_type=None, target_Type=None, source_Type=None,page=0):
|
93 |
55686
|
sandro.lab
|
queries = []
|
94 |
|
|
if provider:
|
95 |
|
|
log.info("PROVIDER NOT NONE: {}".format(provider))
|
96 |
|
|
queries.append(self.create_dataSource_query(provider))
|
97 |
|
|
if s_pid:
|
98 |
|
|
log.info("S_PID NOT NONE: {}".format(s_pid))
|
99 |
|
|
queries.append(self.create_pid_query(s_pid, 'source'))
|
100 |
|
|
if t_pid:
|
101 |
|
|
queries.append(self.create_pid_query(t_pid, 'target'))
|
102 |
|
|
if s_publisher:
|
103 |
|
|
queries.append(self.create_publisher_query(s_publisher, 'source'))
|
104 |
|
|
if t_publisher:
|
105 |
|
|
queries.append(self.create_publisher_query(t_publisher, 'target'))
|
106 |
|
|
if s_pid_type:
|
107 |
|
|
queries.append(self.create_pidType_query(s_pid_type, 'source'))
|
108 |
|
|
if t_pid_type:
|
109 |
56909
|
sandro.lab
|
queries.append(self.create_pidType_query(t_pid_type, 'target'))
|
110 |
55686
|
sandro.lab
|
if target_Type:
|
111 |
58092
|
sandro.lab
|
if 'literature' == target_Type:
|
112 |
|
|
target_Type = 'publication'
|
113 |
57982
|
sandro.lab
|
queries.append(self.create_typology_query(target_Type,'target'))
|
114 |
|
|
if source_Type:
|
115 |
58092
|
sandro.lab
|
if 'literature' == source_Type:
|
116 |
|
|
source_Type = 'publication'
|
117 |
|
|
queries.append(self.create_typology_query(source_Type,'source'))
|
118 |
57982
|
sandro.lab
|
|
119 |
58092
|
sandro.lab
|
|
120 |
55686
|
sandro.lab
|
q = None
|
121 |
|
|
for item in queries:
|
122 |
|
|
if not q:
|
123 |
|
|
q = item
|
124 |
|
|
else:
|
125 |
58092
|
sandro.lab
|
q = q & item
|
126 |
55686
|
sandro.lab
|
log.debug("REQUEST CREATED {}".format(q))
|
127 |
58279
|
sandro.lab
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(q)
|
128 |
58515
|
sandro.lab
|
print (search_object.to_dict())
|
129 |
55686
|
sandro.lab
|
log.debug("Page request size is {}".format(page))
|
130 |
|
|
if page > 9999:
|
131 |
|
|
return []
|
132 |
|
|
|
133 |
|
|
return search_object[page:page + 100].execute()
|
134 |
|
|
|
135 |
|
|
|
136 |
|
|
def realtionToPid(self, pid, pidType=None, datasource=None, typology=None, page=0):
|
137 |
|
|
if pidType:
|
138 |
|
|
query = self.create_pid_pidType_query(pidType.lower(), pid.lower())
|
139 |
|
|
else:
|
140 |
|
|
query = self.create_source_pid_query(pid.lower())
|
141 |
|
|
filters = []
|
142 |
|
|
if datasource and len(datasource):
|
143 |
|
|
filters.append(self.create_dataSource_query(datasource))
|
144 |
|
|
if typology and len(typology):
|
145 |
57982
|
sandro.lab
|
filters.append(self.create_typology_query(typology,'target'))
|
146 |
58279
|
sandro.lab
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(query)
|
147 |
55686
|
sandro.lab
|
|
148 |
|
|
if len(filters):
|
149 |
|
|
search_object = search_object.filter(Q('bool', must=filters))
|
150 |
|
|
if page > 9999:
|
151 |
|
|
return []
|
152 |
|
|
return search_object[page:page + 100].execute()
|
153 |
|
|
|
154 |
|
|
def realtionToTypology(self, typology, page=0):
|
155 |
58279
|
sandro.lab
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(
|
156 |
57982
|
sandro.lab
|
self.create_typology_query(typology,'target'))
|
157 |
55686
|
sandro.lab
|
if page > 9999:
|
158 |
|
|
return []
|
159 |
|
|
return search_object[page:page + 100].execute()
|