|
1 |
'use strict';
|
|
2 |
|
|
3 |
import {properties} from "../../explore/src/environments/environment";
|
|
4 |
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
|
|
5 |
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
|
|
6 |
|
|
7 |
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
|
8 |
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
|
9 |
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
|
10 |
|
|
11 |
const request = require('superagent');
|
|
12 |
|
|
13 |
/** True when a refine value lacks a name/id or carries a placeholder name such as "unknown". */
function isPlaceholderRefineValue(value: any): boolean {
  if (!value || !value.name || !value.id) {
    return true;
  }
  const loweredName = value.name.toLowerCase();
  return loweredName.includes('unknown') || loweredName.includes('not available')
    || value.name == "unidentified" || value.name == "Undetermined";
}

/**
 * Requests one results page and hands the response to parseAllUrls.
 * Failures are logged to the console and to the error file; the returned
 * promise always resolves so that one bad query does not abort the crawl.
 */
function fetchResultsPage(url: string, allUrls: Set<string>): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    request.get(url, function (err: any, response: any) {
      if (!response && err) {
        reject(err);
        return;
      }
      try {
        parseAllUrls(response, allUrls);
        resolve();
      } catch (parseError) {
        // An exception thrown inside this callback would not reject the promise on its own.
        reject(parseError);
      }
    });
  }).catch((error: unknown) => {
    console.error("Error getting results ", error);
    // The error is stringified explicitly: appendFileSync expects string/Buffer data.
    fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(error) + "\n");
  });
}

/**
 * Crawls the search API once per refine-filter value and records the
 * landing-page URLs found in the sitemap file.
 *
 * Flow: fetch the refine filters from `refineUrl`; for every filter key and
 * every usable value build a results query and request it (request launches
 * are spaced 500 ms apart); each response is passed to `parseAllUrls`. When all
 * requests have settled, the closing `</urlset>` tag is appended to `fileName`
 * and the "total_time" timer is ended.
 *
 * @param resultsPerUrl page size requested for every results query
 */
function get(resultsPerUrl: number) {
  // No delay is given, so the work starts on a later timer tick, after the caller has returned.
  setTimeout(() => {
    const fieldIdsMap = new SearchFields().RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ", err);
        return;
      }

      // A response without a body or without 'refineResults' yields zero keys instead of a crash.
      const refineResults = (refineResponse && refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      const allUrls = new Set<string>();
      const pending: Promise<void>[] = [];

      for (const key of keys) {
        if (key == "community") {
          // publicCommunities is filled by getCommunities(); wait for it before filtering on it.
          await communitiesPromise;
        }
        const values = refineResults[key] || [];
        console.log("key: " + key + ", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without a field definition there is no equality operator to build the query with.
          console.log("no search field definition for key: " + key);
          continue;
        }

        for (const value of values) {
          if (isPlaceholderRefineValue(value)) {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before "||" is compared against publicCommunities.
            const communityId = value.id.split("||")[0];
            if (!communityId || !publicCommunities.includes(communityId)) {
              console.log("hidden community: " + communityId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator
            + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;

          // Throttle: wait 500 ms before launching each request.
          await new Promise(resolve => setTimeout(resolve, 500));
          pending.push(fetchResultsPage(url, allUrls));
        }
        console.log("");
      }

      await Promise.all(pending);
      console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);

      fs.appendFile("./" + fileName, "\n</urlset>", function (err: any) {
        if (err) {
          return console.log("Error appending in file " + fileName + ": ", err);
        }
        console.timeEnd("total_time");
      });
    });
  });
}
|
|
93 |
// });
|
|
94 |
|
|
95 |
/**
 * Converts one results response into landing-page URLs and appends every URL
 * not seen before to the sitemap file.
 *
 * Side effects: increments the module counters `alreadyin` (duplicates) and
 * `notin` (new URLs), adds new URLs to `allUrls`, appends one
 * `<url><loc>…</loc></url>` entry per new URL to `fileName`, and, when the
 * response yields no results, appends "<status> <request url>" to
 * `errorFileName`.
 *
 * @param response response object delivered by `request.get`; reads
 *                 `body.results`, `statusCode` and `request.url`
 * @param allUrls  set of URLs already written; mutated in place
 * @returns the same `allUrls` set
 */
function parseAllUrls(response: any, allUrls: any) {
  const rawResults: any = response.body ? response.body['results'] : undefined;
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", rawResults, properties) || [];

  if (searchResults.length < 100 && searchResults.length > 0) {
    // NOTE(review): 100 presumably mirrors the page size passed to buildSiteMap — confirm.
    console.log("num of results: " + searchResults.length + " " + response.request.url);
  }

  if (searchResults.length == 0) {
    // Terminated with a real newline so every failed query gets its own line in the error file.
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (const searchResult of searchResults) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResult, searchResult.entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Prefer the persistent identifier (URL-encoded); otherwise fall back to the result's own id.
    const url = pid && pid.id
      ? getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id))
      : getUrlByType(resultPreview.resultType, null, resultPreview.id);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    allUrls.add(url);
    fs.appendFileSync("./" + fileName, "\n<url><loc>" + url + "</loc></url>");
    notin++;
  }

  return allUrls;
}
|
|
150 |
|
|
151 |
|
|
152 |
// function parseAllUrls1(response) {
//   let allUrls = [];
//
//   let responses = response.body['results'];
//   let length = Array.isArray(responses) ? responses.length : 1;
//
//   for (let i = 0; i < length; i++) {
//     let p = new parsingFunctions.ParsingFunctions();
//     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
//
//     let type = "result";
//     if (resData['resulttype']) {
//       type = resData['resulttype']['classname'];
//     }
//
//     if (resData['pid']) {
//       let identifiers = p.parseIdentifiers(resData['pid']);
//       let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
//
//       if(pid && pid.id) {
//         allUrls[i] = getUrlByType(type, pid, pid.id);
//       } else {
//         let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
//         allUrls[i] = getUrlByType(type, null, canId);
//       }
//     } else {
//       let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
//       allUrls[i] = getUrlByType(type, null, canId);
//     }
//   }
//   return allUrls;
// }
//
|
|
185 |
/**
 * Builds the landing-page URL for one research result.
 *
 * The URL has the form `<landingPrefix><type>?<parameter>=<id>`. When `pid`
 * is truthy the parameter is always "pid"; otherwise it depends on `type`:
 * "publication" → articleId, "dataset" → datasetId, "software" → softwareId,
 * "other" → orpId, anything else → id.
 *
 * @param type result type; also used verbatim as the path segment
 * @param pid  persistent identifier of the result, or a falsy value when there is none
 * @param id   value placed after "="; it is inserted as given (no encoding is applied here)
 * @returns the landing-page URL
 */
function getUrlByType(type: any, pid: any, id: any): string {
  let parameter: string;

  if (pid) {
    // A persistent identifier takes precedence over the type-specific parameter.
    parameter = "pid";
  } else {
    switch (type) {
      case "publication":
        parameter = "articleId";
        break;
      case "dataset":
        parameter = "datasetId";
        break;
      case "software":
        parameter = "softwareId";
        break;
      case "other":
        parameter = "orpId";
        break;
      default:
        parameter = "id";
    }
  }

  return landingPrefix + type + "?" + parameter + "=" + id;
}
|
|
204 |
|
|
205 |
/**
 * Starts loading the communities from `contextUrl` and stores the in-flight
 * work in `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced with the ids of the entries
 * returned by `ContextsService.parseCommunities` for the response body. On any
 * failure (no response, or an exception while parsing) the error is logged and
 * `communitiesPromise` still resolves, so code awaiting it is never blocked;
 * `publicCommunities` then keeps its previous value.
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        reject(err);
        return;
      }
      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map((value: any) => value.id);
        resolve();
      } catch (parseError) {
        // An exception thrown inside this callback would not settle the promise on its own.
        reject(parseError);
      }
    });
  }).catch(error => console.error("Error getting communities ", error));
}
|
|
218 |
|
|
219 |
/**
 * Entry point: creates today's sitemap file with the XML header and starts the crawl.
 *
 * Sets the module-level `fileName` ("sitemap_<year>_<month>_<day>.xml") and
 * `errorFileName` ("error_<year>_<month>_<day>.txt"), starts the "total_time"
 * console timer, writes the `<urlset>` opening to the sitemap file, then calls
 * `getCommunities()` and `get(resultsPerUrl)`.
 *
 * The header is written synchronously so it is on disk before any later
 * append to the same file. If that write fails, the error is logged and the
 * crawl is not started.
 *
 * @param resultsPerUrl page size forwarded to `get`
 */
function buildSiteMap(resultsPerUrl: number) {
  console.time("total_time");

  const date = new Date();
  // getMonth() is zero-based, hence the +1.
  const dateSuffix = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate();
  fileName = "sitemap_" + dateSuffix + ".xml";
  errorFileName = "error_" + dateSuffix + ".txt";
  console.log("Buiding sitemap in file: " + fileName + "\n");

  const sitemapHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";

  try {
    fs.writeFileSync("./" + fileName, sitemapHeader);
  } catch (err) {
    console.log("Error writing in file " + fileName + ": ", err);
    return;
  }

  getCommunities();
  get(resultsPerUrl);
}
|
|
239 |
|
|
240 |
|
|
241 |
// Output file names; both are assigned by buildSiteMap().
let fileName: string;
let errorFileName: string;
const fs = require('fs');

let alreadyin = 0; // duplicate urls
let notin = 0; // urls seen for the first time (each one is written to the sitemap)

// Assigned by getCommunities(); awaited in get() before community values are filtered.
let communitiesPromise: Promise<unknown>;
// Ids of the communities returned by the contexts service; filled by getCommunities().
let publicCommunities: string[] = [];

const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";
const contextUrl = "https://services.openaire.eu/openaire/contexts/";

// Run the script: request 100 results per query.
buildSiteMap(100);
|
// [Services | Angular 11]: Merge from trunk