Project

General

Profile

1
'use strict';
2

    
3
import {properties} from "../../explore/src/environments/environment";
4
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
5
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
6

    
7
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
8
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
9
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
10

    
11
const request = require('superagent');
12

    
13
/**
 * Crawls the search API and writes the resulting landing-page URLs into the sitemap file.
 *
 * Flow:
 *  1. Requests `refineUrl` to obtain the refine (facet) values per field.
 *  2. For every usable value of every field, requests one page of `resultsPerUrl` results
 *     filtered by that value; each response is handed to `parseAllUrls`, which appends the
 *     not-yet-seen URLs to the sitemap file.
 *  3. Once every request has settled, appends the closing `</urlset>` tag and stops the
 *     "total_time" timer.
 *
 * A failed or unparsable results request is recorded (console / error file) but does not stop
 * the crawl, so the sitemap is always terminated with `</urlset>`.
 *
 * @param resultsPerUrl page size used for every per-value results request
 */
function get(resultsPerUrl: number): void {
  // Kept from the original flow: the crawl starts on a later timer tick (setTimeout without delay).
  setTimeout(() => {
    const fieldIdsMap = new SearchFields().RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ", err);
        return;
      }

      // A response without `refineResults` is treated as "no fields" instead of dereferencing null.
      const refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      // Shared across all responses so that a URL reachable through several filters is written once.
      const allUrls = new Set<string>();
      const pendingRequests: Promise<void>[] = [];

      for (const key of keys) {
        if (key == "community") {
          // The community filter below needs `publicCommunities`, which getCommunities() fills.
          try {
            await communitiesPromise;
          } catch (communitiesError) {
            // publicCommunities keeps its current content, so unknown communities are skipped below.
            console.error("Communities are not available, continuing without them ", communitiesError);
          }
        }

        const values = refineResults[key] || [];
        console.log("key: " + key + ", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without the field definition the equality operator of the filter query is unknown.
          console.error("No search field definition for refine key: " + key);
          continue;
        }

        for (const value of values) {
          if (!isUsableRefineValue(value)) {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before "||" is compared against the known community ids.
            const valueId = value.id.split("||")[0];
            if (!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: " + valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
          pendingRequests.push(fetchResultsInto(url, allUrls));
        }
        console.log("");
      }

      await Promise.all(pendingRequests);
      console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);

      fs.appendFile("./" + fileName, "\n</urlset>", function (appendErr: any) {
        if (appendErr) {
          return console.log("Error appending in file " + fileName + ": ", appendErr);
        }
        console.timeEnd("total_time");
      });
    });
  });
}

/** Returns false for refine values without name/id or whose name marks an unknown/undetermined value. */
function isUsableRefineValue(value: any): boolean {
  if (!value || !value.name || !value.id) {
    return false;
  }
  const lowerCaseName = value.name.toLowerCase();
  return !(lowerCaseName.includes('unknown') || lowerCaseName.includes('not available')
    || value.name == "unidentified" || value.name == "Undetermined");
}

/**
 * Requests one results page and feeds the response to `parseAllUrls`.
 * The returned promise always resolves: failures are logged and written to the error file,
 * so a single bad request cannot abort the whole crawl or leave it waiting forever.
 */
function fetchResultsInto(url: string, allUrls: Set<string>): Promise<void> {
  return new Promise<void>((resolve) => {
    request.get(url, function (err: any, response: any) {
      try {
        if (!response && err) {
          console.error("Error getting results ", err);
          // fs.appendFileSync only accepts strings/buffers, so the error is stringified explicitly.
          fs.appendFileSync("./" + errorFileName, "no response  " + url + String(err) + "\n");
        } else {
          parseAllUrls(response, allUrls);
        }
      } catch (processingError) {
        console.error("Error processing results of " + url + " ", processingError);
      } finally {
        resolve();
      }
    });
  });
}
90
// });
91

    
92
/**
 * Extracts the landing-page URL of every result in a search response and appends the URLs that
 * have not been seen before to the sitemap file as `<url><loc>…</loc></url>` entries.
 *
 * Side effects:
 *  - increments the file-level counters `alreadyin` (duplicates) and `notin` (new URLs);
 *  - appends to the sitemap file (`fileName`);
 *  - appends a line with status code and request URL to the error file (`errorFileName`)
 *    when the response yields no results.
 *
 * @param response superagent response of a results request (reads `body['results']`,
 *                 `statusCode` and `request.url`)
 * @param allUrls  set of URLs written so far; new URLs are added to it
 * @returns the same `allUrls` set
 */
function parseAllUrls(response: any, allUrls: any) {
  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  // NOTE(review): 100 presumably mirrors the page size passed to buildSiteMap — confirm.
  if (searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: " + searchResults.length + "   " + response.request.url);
  }

  if (searchResults.length == 0) {
    // One line per empty response (the original wrote the literal "/n" instead of a newline).
    fs.appendFileSync("./" + errorFileName, response.statusCode + "  " + response.request.url + "\n");
  }

  const urlPre = "<url>\n" +
    "    <loc>";
  const urlSuf = "</loc>\n" +
    "    </url>";

  for (let j = 0; j < searchResults.length; j++) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);

    // Prefer a persistent identifier for the URL; fall back to the result's own id.
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);
    const url = (pid && pid.id)
      ? getUrlByType(resultPreview.resultType, pid, pid.id)
      : getUrlByType(resultPreview.resultType, null, resultPreview.id);

    if (allUrls.has(url)) {
      alreadyin++;
    } else {
      allUrls.add(url);
      // The sitemap protocol requires entity-escaped values inside <loc>.
      fs.appendFileSync("./" + fileName, urlPre + escapeXmlText(url) + urlSuf);
      notin++;
    }
  }
  return allUrls;
}

/** Escapes the five XML special characters so that `value` can be embedded as XML text. */
function escapeXmlText(value: any): string {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
149

    
150

    
151
// function parseAllUrls1(response) {
152
//   let allUrls = [];
153
//
154
//   let responses = response.body['results'];
155
//   let length = Array.isArray(responses) ? responses.length : 1;
156
//
157
//   for (let i = 0; i < length; i++) {
158
//     let p = new parsingFunctions.ParsingFunctions();
159
//     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
160
//
161
//     let type = "result";
162
//     if (resData['resulttype']) {
163
//       type = resData['resulttype']['classname'];
164
//     }
165
//
166
//     if (resData['pid']) {
167
//       let identifiers = p.parseIdentifiers(resData['pid']);
168
//       let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
169
//
170
//       if(pid && pid.id) {
171
//         allUrls[i] = getUrlByType(type, pid, pid.id);
172
//       } else {
173
//         let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
174
//         allUrls[i] = getUrlByType(type, null, canId);
175
//       }
176
//     } else {
177
//       let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
178
//       allUrls[i] = getUrlByType(type, null, canId);
179
//     }
180
//   }
181
//   return allUrls;
182
// }
183
//
184
/**
 * Builds the landing-page URL of a result: `landingPrefix + type + "?" + <parameter> + "=" + id`.
 *
 * The query parameter name is "pid" whenever `pid` is truthy; otherwise it depends on `type`:
 * "publication" -> articleId, "dataset" -> datasetId, "software" -> softwareId,
 * "other" -> orpId, anything else -> id.
 *
 * @param type result type, also used as the path segment after `landingPrefix`
 * @param pid  persistent identifier object, or a falsy value when the result has none
 * @param id   value placed after the "=" sign
 */
function getUrlByType(type: any, pid: any, id: any) {
  let queryParameter: any;

  if (pid) {
    // A persistent identifier always wins over the type-specific parameter name.
    queryParameter = "pid";
  } else {
    switch (type) {
      case "publication":
        queryParameter = "articleId";
        break;
      case "dataset":
        queryParameter = "datasetId";
        break;
      case "software":
        queryParameter = "softwareId";
        break;
      case "other":
        queryParameter = "orpId";
        break;
      default:
        queryParameter = "id";
    }
  }

  const path = landingPrefix + type;
  return path + "?" + queryParameter + "=" + id;
}
203

    
204
/**
 * Starts loading the communities from `contextUrl` and stores the pending request in the
 * file-level `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced with the ids of the communities returned by
 * `ContextsService.parseCommunities` and the promise resolves. When the request yields no
 * response, or the response cannot be parsed, the problem is logged and the promise rejects
 * with the underlying error (instead of staying pending forever).
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        console.error("Error getting communities ", err);
        reject(err);
        return;
      }

      try {
        const contextsService = new ContextsService();
        // NOTE(review): meaning of the `false` flag is defined by ContextsService — confirm.
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve();
      } catch (parseError) {
        console.error("Error parsing communities ", parseError);
        reject(parseError);
      }
    });
  });

  // The failure is already logged above. Registering a handler here only prevents an
  // "unhandled rejection" when the request fails before anyone awaits the promise;
  // code awaiting `communitiesPromise` still observes the rejection.
  communitiesPromise.catch(() => undefined);
}
218

    
219
/**
 * Entry point of the sitemap generation.
 *
 * Starts the "total_time" timer, derives the sitemap and error file names from the current
 * date (`sitemap_<year>_<month>_<day>.xml`, `error_<year>_<month>_<day>.txt`), writes the XML
 * header with the opening `<urlset>` tag, and then starts loading the communities and crawling
 * the results.
 *
 * The header is written synchronously so that it is guaranteed to be in the file before any
 * URL entry is appended. If the header cannot be written, the error is logged and the crawl
 * is not started.
 *
 * @param resultsPerUrl page size forwarded to `get`
 */
function buildSiteMap(resultsPerUrl: number) {
  console.time("total_time");

  const date = new Date();
  const dateStamp = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate();
  fileName = "sitemap_" + dateStamp + ".xml";
  errorFileName = "error_" + dateStamp + ".txt";
  console.log("Buiding sitemap in file: " + fileName + "\n");

  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">";

  try {
    // Overwrites an existing file with the same name.
    fs.writeFileSync("./" + fileName, sitemap);
  } catch (err) {
    console.log("Error writing in file " + fileName + ": ", err);
    return;
  }

  getCommunities();
  get(resultsPerUrl);
}
239

    
240

    
241
const fs = require('fs');

// Endpoints of the beta services used by the crawl.
const contextUrl = "https://beta.services.openaire.eu/openaire/contexts/";
const landingPrefix = "https://beta.explore.openaire.eu/search/";
const resultsUrlPrefix = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json";
const refineUrl = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";

// Output files; both names are assigned by buildSiteMap().
let fileName;
let errorFileName;

// Counters reported at the end of the crawl.
let notin = 0;      // urls written to the sitemap
let alreadyin = 0;  // duplicate urls

// Filled by getCommunities().
let publicCommunities = [];
let communitiesPromise;

// Run the generation, requesting 100 results per filter value.
buildSiteMap(100);
(1-1/4)