1
|
'use strict';
|
2
|
|
3
|
import {properties} from "../../explore/src/environments/environment";
|
4
|
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
|
5
|
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
|
6
|
|
7
|
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
8
|
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
9
|
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
10
|
|
11
|
const request = require('superagent');
|
12
|
|
13
|
/**
 * Crawls the search API and lets `parseAllUrls` append every not-yet-seen landing URL
 * to the sitemap file, then closes the `<urlset>` element.
 *
 * Flow:
 *  1. Request the refine (facet) values from `refineUrl`.
 *  2. For every usable value of every refine key, request up to `resultsPerUrl` results
 *     filtered by that value.
 *  3. Pass each response to `parseAllUrls` together with the shared set of seen URLs.
 *  4. When every request has finished (successfully or not), append `</urlset>`.
 *
 * A failed result request is logged (console + error file) and does not abort the run,
 * so the closing tag is always written once the refine request itself has succeeded.
 *
 * @param resultsPerUrl page size requested for each filtered query
 */
function get(resultsPerUrl: number) {
  // No delay argument: the callback simply runs on a later turn of the event loop.
  setTimeout(() => {
    const searchFields = new SearchFields();
    const fieldIdsMap = searchFields.RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ", err);
        return;
      }

      // A response without 'refineResults' is treated as "no keys" instead of crashing on null.
      const refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      const allUrls = new Set<string>();
      const promiseArray: Promise<void>[] = [];
      let failedRequests = 0;

      for (const key of keys) {
        if (key == "community") {
          // publicCommunities is filled by getCommunities(); wait for it before filtering.
          try {
            await communitiesPromise;
          } catch (communitiesError) {
            // publicCommunities stays as it was, so the check below decides which values are kept.
            console.error("Communities could not be loaded ", communitiesError);
          }
        }

        const values = refineResults[key] || [];
        console.log("key: " + key + ", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without a field definition there is no equality operator to build the query with.
          console.error("No search field definition for key: " + key);
          continue;
        }

        for (const value of values) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before "||" is compared with the public community ids.
            const valueId = value.id.split("||")[0];
            if (!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: " + valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;

          // Each promise always resolves: failures are recorded, not propagated, so one bad
          // request (or one exception while parsing) cannot block or abort the whole run.
          promiseArray.push(new Promise<void>((resolve) => {
            request.get(url, function (err: any, response: any) {
              try {
                if (!response && err) {
                  failedRequests++;
                  console.error("Error getting results ", err);
                  // appendFileSync expects a string/buffer, so the error is stringified explicitly.
                  fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(err) + "\n");
                } else {
                  parseAllUrls(response, allUrls);
                }
              } catch (processingError) {
                failedRequests++;
                console.error("Error processing results of " + url + " ", processingError);
              } finally {
                resolve();
              }
            });
          }));
        }
        console.log("");
      }

      await Promise.all(promiseArray);
      console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);
      if (failedRequests > 0) {
        console.log("Failed requests: " + failedRequests);
      }

      fs.appendFile("./" + fileName, "\n</urlset>", function (err: any) {
        if (err) {
          return console.log("Error appending in file " + fileName + ": ", err);
        }
        console.timeEnd("total_time");
      });
    });
  });
}
|
90
|
// });
|
91
|
|
92
|
/**
 * Turns one search response into landing-page URLs and appends the new ones to the sitemap file.
 *
 * The results in `response.body['results']` are parsed with `SearchResearchResultsService`,
 * converted to result previews, and mapped to a URL via `getUrlByType` (by persistent
 * identifier when one is available, otherwise by the result id). URLs already present in
 * `allUrls` only increase the `alreadyin` counter; new ones are added to the set, written to
 * the sitemap as a `<url><loc>…</loc></url>` entry and counted in `notin`.
 *
 * A response that yields no results is recorded in the error file (status code + request URL).
 *
 * @param response superagent response of a results query
 * @param allUrls  set of URLs written so far; mutated in place
 * @returns the same `allUrls` set
 */
function parseAllUrls(response: any, allUrls: any) {
  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  // NOTE(review): 100 looks like the requested page size (buildSiteMap(100)) — confirm.
  if (searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: " + searchResults.length + " " + response.request.url);
  }

  if (searchResults.length == 0) {
    // One entry per line in the error file (the original wrote the literal characters "/n").
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (const searchResult of searchResults) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResult, searchResult.entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    const url = (pid && pid.id)
      ? getUrlByType(resultPreview.resultType, pid, pid.id)
      : getUrlByType(resultPreview.resultType, null, resultPreview.id);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    allUrls.add(url);
    const urlPre = "<url>\n" +
      " <loc>";
    const urlSuf = "</loc>\n" +
      " </url>";
    // The sitemap is XML, so characters such as '&' or '<' in the URL must be entity-escaped.
    fs.appendFileSync("./" + fileName, urlPre + escapeXmlText(url) + urlSuf);
    notin++;
  }
  return allUrls;
}

/** Escapes the five XML special characters so `text` can be embedded in element content. */
function escapeXmlText(text: string): string {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
|
149
|
|
150
|
|
151
|
// function parseAllUrls1(response) {
|
152
|
// let allUrls = [];
|
153
|
//
|
154
|
// let responses = response.body['results'];
|
155
|
// let length = Array.isArray(responses) ? responses.length : 1;
|
156
|
//
|
157
|
// for (let i = 0; i < length; i++) {
|
158
|
// let p = new parsingFunctions.ParsingFunctions();
|
159
|
// let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
|
160
|
//
|
161
|
// let type = "result";
|
162
|
// if (resData['resulttype']) {
|
163
|
// type = resData['resulttype']['classname'];
|
164
|
// }
|
165
|
//
|
166
|
// if (resData['pid']) {
|
167
|
// let identifiers = p.parseIdentifiers(resData['pid']);
|
168
|
// let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
|
169
|
//
|
170
|
// if(pid && pid.id) {
|
171
|
// allUrls[i] = getUrlByType(type, pid, pid.id);
|
172
|
// } else {
|
173
|
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
174
|
// allUrls[i] = getUrlByType(type, null, canId);
|
175
|
// }
|
176
|
// } else {
|
177
|
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
178
|
// allUrls[i] = getUrlByType(type, null, canId);
|
179
|
// }
|
180
|
// }
|
181
|
// return allUrls;
|
182
|
// }
|
183
|
//
|
184
|
/**
 * Builds the landing-page URL of a result: `landingPrefix + type + "?" + <parameter> + "=" + id`.
 *
 * The query parameter name is "pid" whenever a persistent identifier is supplied; otherwise it
 * depends on the result type ("articleId", "datasetId", "softwareId", "orpId"), falling back
 * to "id" for any other type.
 *
 * The id is percent-encoded so that reserved characters inside it (for example '#', '&', '?'
 * or spaces) stay part of the parameter value instead of breaking the URL.
 *
 * @param type result type, also used as the path segment after `landingPrefix`
 * @param pid  persistent identifier object, or a falsy value when the internal id is used
 * @param id   value of the query parameter
 * @returns the landing-page URL
 */
function getUrlByType(type: any, pid: any, id: any) {
  const parameterByType: Record<string, string> = {
    publication: "articleId",
    dataset: "datasetId",
    software: "softwareId",
    other: "orpId",
  };

  let parameter = "id";
  if (pid) {
    parameter = "pid";
  } else if (typeof type === "string" && Object.prototype.hasOwnProperty.call(parameterByType, type)) {
    parameter = parameterByType[type];
  }

  return landingPrefix + type + "?" + parameter + "=" + encodeURIComponent(id);
}
|
203
|
|
204
|
/**
 * Starts loading the communities from `contextUrl` and stores the pending work in
 * `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced with the ids returned by
 * `ContextsService.parseCommunities` and the promise resolves. On a request error, or when
 * parsing throws, the problem is logged and the promise rejects with the error, so the
 * promise always settles.
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        console.error("Error getting communities ", err);
        reject(err instanceof Error ? err : new Error(String(err)));
        return;
      }

      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve();
      } catch (parseError) {
        // Without this the promise would stay pending forever and block anyone awaiting it.
        console.error("Error parsing communities ", parseError);
        reject(parseError);
      }
    });
  });

  // The failure is already logged above. Attaching a handler here keeps a rejection from being
  // reported as unhandled when nobody awaits the promise; later awaiters still see the rejection.
  communitiesPromise.catch(() => {});
}
|
218
|
|
219
|
/**
 * Entry point: creates today's sitemap file with the XML header and starts the crawl.
 *
 * Sets the module-level `fileName` ("sitemap_<year>_<month>_<day>.xml") and `errorFileName`
 * ("error_<year>_<month>_<day>.txt"), writes the XML declaration and the opening `<urlset>`
 * tag, then calls `getCommunities()` and `get(resultsPerUrl)`.
 *
 * The header is written synchronously so it is guaranteed to be in place before any URL is
 * appended; if it cannot be written the run stops instead of producing a file without header.
 *
 * @param resultsPerUrl page size forwarded to `get`
 */
function buildSiteMap(resultsPerUrl: number) {
  console.time("total_time");

  const date = new Date();
  const dateSuffix = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate();
  fileName = "sitemap_" + dateSuffix + ".xml";
  errorFileName = "error_" + dateSuffix + ".txt";
  console.log("Buiding sitemap in file: "+fileName+"\n");

  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">";

  try {
    fs.writeFileSync("./" + fileName, sitemap);
  } catch (err) {
    console.log("Error writing in file " + fileName + ": ", err);
    return;
  }

  getCommunities();
  get(resultsPerUrl);
}
|
239
|
|
240
|
|
241
|
// File-system module used for every sitemap and error-file write.
const fs = require('fs');

// Output file names; both are assigned in buildSiteMap().
let fileName: string;
let errorFileName: string;

// URL counters updated in parseAllUrls().
let alreadyin = 0; // duplicate urls
let notin = 0; // unique urls

// Pending communities request and the community ids it produced (see getCommunities()).
let communitiesPromise: Promise<void>;
let publicCommunities: string[] = [];

// Service endpoints and the prefix of the generated landing-page URLs.
const refineUrl = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
const resultsUrlPrefix = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://beta.explore.openaire.eu/search/";
const contextUrl = "https://beta.services.openaire.eu/openaire/contexts/";

// Run the script, requesting 100 results per filtered query.
buildSiteMap(100);
|