1
|
from elasticsearch import *
|
2
|
from elasticsearch_dsl import *
|
3
|
from os import path
|
4
|
import os
|
5
|
from eu.dnetlib.util import get_index_properties
|
6
|
from elasticsearch_dsl.response import Response
|
7
|
import logging
|
8
|
|
9
|
log = logging.getLogger("scholexplorer")
|
10
|
|
11
|
class ScholixConnector(object):
|
12
|
|
13
|
__instance = None
|
14
|
|
15
|
def __new__(cls):
|
16
|
if ScholixConnector.__instance is None:
|
17
|
ScholixConnector.__instance = object.__new__(cls)
|
18
|
props = get_index_properties()
|
19
|
index_name = props['api.index']+"_scholix"
|
20
|
index_host = [x for x in props['es_index'].split(',')]
|
21
|
#connections.create_connection(hosts=index_host, timeout=1000)
|
22
|
ScholixConnector.__instance.connection_pool = ConnectionPool([(Elasticsearch(hosts=index_host, timeout=1000),{}) for x in range(10)])
|
23
|
ScholixConnector.__instance.index_host = index_host
|
24
|
ScholixConnector.__instance.index_name = index_name
|
25
|
return ScholixConnector.__instance
|
26
|
|
27
|
def create_pidType_query(self, value, start):
|
28
|
args = {start + '.identifier.schema': value}
|
29
|
return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
|
30
|
|
31
|
def create_pid_query(self, value, start):
|
32
|
args = {start + '.identifier.identifier': value.lower()}
|
33
|
return Q('nested', path=start + '.identifier', query=Q('bool', must=[Q('match', **args)]))
|
34
|
|
35
|
def create_typology_query(self, value, start):
|
36
|
args = {start + '.objectType': value}
|
37
|
return Q('nested', path=start, query=Q('bool', must=[Q('match', **args)]))
|
38
|
|
39
|
def create_dataSource_query(self, value):
|
40
|
args = {'linkprovider.name': value}
|
41
|
return Q('nested', path='linkprovider', query=Q('bool', must=[Q('match', **args)]))
|
42
|
|
43
|
def create_publisher_query(self, value, start):
|
44
|
args = {start + '.publisher.name': value}
|
45
|
q = Q('nested', path=start + '.publisher', query=Q('bool', must=[Q('match', **args)]))
|
46
|
return Q('nested', path=start, query=q)
|
47
|
|
48
|
def list_datasources(self, ds_name=None):
|
49
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix')
|
50
|
if ds_name:
|
51
|
search_object = search_object.query(self.create_dataSource_query(ds_name))
|
52
|
else:
|
53
|
search_object = search_object.query()
|
54
|
search_object.aggs.bucket('all_datasources', 'nested', path='linkprovider').bucket('all_names', 'terms',
|
55
|
field='linkprovider.name',
|
56
|
size=100)
|
57
|
|
58
|
response = search_object.execute()
|
59
|
if ds_name:
|
60
|
for item in response.aggs.all_datasources.all_names.buckets:
|
61
|
if item.key == ds_name:
|
62
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
63
|
|
64
|
else:
|
65
|
for item in response.aggs.all_datasources.all_names.buckets:
|
66
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
67
|
|
68
|
|
69
|
def list_publisher(self, start, pub_name=None):
|
70
|
log.info("Started Index from host")
|
71
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name)
|
72
|
if pub_name:
|
73
|
search_object = search_object.query(self.create_publisher_query(pub_name, start))
|
74
|
else:
|
75
|
search_object = search_object.query()
|
76
|
search_object.aggs.bucket('all_targets', 'nested', path=start).bucket('all_t_pubs', 'nested',
|
77
|
path=start + '.publisher').bucket(
|
78
|
'all_pubs', 'terms',
|
79
|
field=start + '.publisher.name',
|
80
|
size=1000)
|
81
|
|
82
|
response = search_object.execute()
|
83
|
for item in response.aggs.all_targets.all_t_pubs.all_pubs.buckets:
|
84
|
if pub_name and item.key == pub_name:
|
85
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
86
|
else:
|
87
|
yield dict(name=item.key, totalRelationships=item.doc_count)
|
88
|
|
89
|
|
90
|
|
91
|
def links(self, provider=None, s_pid=None, t_pid=None, s_publisher=None, t_publisher=None, s_pid_type=None,
|
92
|
t_pid_type=None, target_Type=None, source_Type=None,page=0):
|
93
|
queries = []
|
94
|
if provider:
|
95
|
log.info("PROVIDER NOT NONE: {}".format(provider))
|
96
|
queries.append(self.create_dataSource_query(provider))
|
97
|
if s_pid:
|
98
|
log.info("S_PID NOT NONE: {}".format(s_pid))
|
99
|
queries.append(self.create_pid_query(s_pid, 'source'))
|
100
|
if t_pid:
|
101
|
queries.append(self.create_pid_query(t_pid, 'target'))
|
102
|
if s_publisher:
|
103
|
queries.append(self.create_publisher_query(s_publisher, 'source'))
|
104
|
if t_publisher:
|
105
|
queries.append(self.create_publisher_query(t_publisher, 'target'))
|
106
|
if s_pid_type:
|
107
|
queries.append(self.create_pidType_query(s_pid_type, 'source'))
|
108
|
if t_pid_type:
|
109
|
queries.append(self.create_pidType_query(t_pid_type, 'target'))
|
110
|
if target_Type:
|
111
|
if 'literature' == target_Type:
|
112
|
target_Type = 'publication'
|
113
|
queries.append(self.create_typology_query(target_Type,'target'))
|
114
|
if source_Type:
|
115
|
if 'literature' == source_Type:
|
116
|
source_Type = 'publication'
|
117
|
queries.append(self.create_typology_query(source_Type,'source'))
|
118
|
|
119
|
|
120
|
q = None
|
121
|
for item in queries:
|
122
|
if not q:
|
123
|
q = item
|
124
|
else:
|
125
|
q = q & item
|
126
|
log.debug("REQUEST CREATED {}".format(q))
|
127
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(q)
|
128
|
log.debug("Page request size is {}".format(page))
|
129
|
if page > 9999:
|
130
|
return []
|
131
|
|
132
|
return search_object[page:page + 100].execute()
|
133
|
|
134
|
|
135
|
def realtionToPid(self, pid, pidType=None, datasource=None, typology=None, page=0):
|
136
|
if pidType:
|
137
|
query = self.create_pid_pidType_query(pidType.lower(), pid.lower())
|
138
|
else:
|
139
|
query = self.create_source_pid_query(pid.lower())
|
140
|
filters = []
|
141
|
if datasource and len(datasource):
|
142
|
filters.append(self.create_dataSource_query(datasource))
|
143
|
if typology and len(typology):
|
144
|
filters.append(self.create_typology_query(typology,'target'))
|
145
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(query)
|
146
|
|
147
|
if len(filters):
|
148
|
search_object = search_object.filter(Q('bool', must=filters))
|
149
|
if page > 9999:
|
150
|
return []
|
151
|
return search_object[page:page + 100].execute()
|
152
|
|
153
|
def realtionToTypology(self, typology, page=0):
|
154
|
search_object = Search(using=self.connection_pool.get_connection(), index=self.index_name).doc_type('scholix').query(
|
155
|
self.create_typology_query(typology,'target'))
|
156
|
if page > 9999:
|
157
|
return []
|
158
|
return search_object[page:page + 100].execute()
|