Project

General

Profile

« Previous | Next » 

Revision 61368

[Services | Angular 11]: Merge from trunk

View differences:

modules/uoa-services-portal/branches/angular-11/services/sitemaps/extractUrlsFromSearch.ts
1
'use strict';
2

  
3
import {properties} from "../../explore/src/environments/environment";
4
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
5
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
6

  
7
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
8
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
9
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
10

  
11
const request = require('superagent');
12

  
13
/**
 * Fetches the refine values from `refineUrl` and, for every accepted
 * (filter key, value) pair, requests one page of search results; each response
 * is handed to `parseAllUrls`, which appends the unseen landing-page URLs to
 * the sitemap file. When all requests have settled, the closing `</urlset>`
 * tag is appended and the "total_time" timer is ended.
 *
 * Values are skipped when they have no name/id, when their name marks them as
 * unknown/unavailable, or (for the "community" key) when the community id is
 * not listed in `publicCommunities`.
 *
 * @param resultsPerUrl page size requested for each (key, value) search.
 */
function get(resultsPerUrl: number) {
  // Kept from the original: the work is deferred to a later timer tick.
  setTimeout(() => {
    const fieldIdsMap = new SearchFields().RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ",err);
        return;
      }

      // A missing body or missing 'refineResults' yields an empty key list
      // instead of a TypeError, so the sitemap is still closed below.
      const body = refineResponse ? refineResponse.body : null;
      const refineResults = (body && body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      const allUrls = new Set();
      const pendingRequests: Promise<void>[] = [];

      for (const key of keys) {
        if (key === "community") {
          // publicCommunities is filled in by getCommunities(); wait for it.
          await communitiesPromise;
        }

        const values = Array.isArray(refineResults[key]) ? refineResults[key] : [];
        console.log("key: "+key+", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without a field definition there is no equality operator to build the query with.
          console.log("no search field definition for key: " + key);
          continue;
        }

        for (const value of values) {
          if(!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: "+(value ? ("name: "+value.name + " - id: "+value.id) : value));
            continue;
          }

          if (key === "community") {
            // Only the part of the id before the first "||" is compared.
            const valueId = value.id.split("||")[0];
            if(!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: "+valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;

          // Wait 500 ms before issuing each request.
          await new Promise(resolve => setTimeout(resolve, 500));
          pendingRequests.push(
            new Promise<void>((resolve, reject) => {
              request.get(url, function (err: any, response: any) {
                if (!response && err) {
                  reject(err);
                  return;
                }
                try {
                  parseAllUrls(response, allUrls);
                  resolve();
                } catch (parseError) {
                  // Settle the promise even when parsing throws, so Promise.all below can finish.
                  reject(parseError);
                }
              });
            }).catch(error => {
              console.error("Error getting results ", error);
              // The error is stringified explicitly: appendFileSync expects string/Buffer data.
              fs.appendFileSync("./"+errorFileName, "no response  "+url+"  "+String(error)+"\n");
            })
          );
        }
        console.log("");
      }

      await Promise.all(pendingRequests);
      console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);

      fs.appendFile("./" + fileName, "\n</urlset>", function (err: any) {
        if (err) {
          return console.log("Error appending in file "+fileName+": ", err);
        }
        console.timeEnd("total_time");
      });
    });
  });
}
93
// });
94

  
95
/**
 * Converts one search response into landing-page URLs and appends every URL
 * that is not yet in `allUrls` to the sitemap file as a `<url><loc>` entry.
 *
 * Side effects: updates the module-level `alreadyin` / `notin` counters, adds
 * to `allUrls`, and writes one line to the error file when the response
 * produced no results.
 *
 * @param response HTTP response whose `body['results']` holds the raw results.
 * @param allUrls  set of URLs already written; mutated in place.
 * @returns the same `allUrls` set that was passed in.
 */
function parseAllUrls(response: any, allUrls: any) {
  const urlPre = "\n<url><loc>";
  const urlSuf = "</loc></url>";

  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  // Log responses with fewer than 100 results.
  // NOTE(review): 100 is hard-coded; presumably it mirrors the page size given to buildSiteMap — confirm.
  if(searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: "+searchResults.length + "   " + response.request.url);
  }

  if(searchResults.length == 0) {
    // Terminate the record with a real newline (the original wrote the two characters "/n").
    fs.appendFileSync("./"+errorFileName, response.statusCode+"  "+response.request.url+"\n");
  }

  for(let j=0; j<searchResults.length; j++) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Use the persistent identifier when one is available, otherwise the result's own id.
    let url;
    if(pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultPreview.id);
    }

    if(allUrls.has(url)) {
      alreadyin++;
    } else {
      allUrls.add(url);
      fs.appendFileSync("./"+fileName, urlPre + url + urlSuf);
      notin++;
    }
  }
  return allUrls;
}
150

  
151

  
152
// function parseAllUrls1(response) {
153
//   let allUrls = [];
154
//
155
//   let responses = response.body['results'];
156
//   let length = Array.isArray(responses) ? responses.length : 1;
157
//
158
//   for (let i = 0; i < length; i++) {
159
//     let p = new parsingFunctions.ParsingFunctions();
160
//     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
161
//
162
//     let type = "result";
163
//     if (resData['resulttype']) {
164
//       type = resData['resulttype']['classname'];
165
//     }
166
//
167
//     if (resData['pid']) {
168
//       let identifiers = p.parseIdentifiers(resData['pid']);
169
//       let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
170
//
171
//       if(pid && pid.id) {
172
//         allUrls[i] = getUrlByType(type, pid, pid.id);
173
//       } else {
174
//         let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
175
//         allUrls[i] = getUrlByType(type, null, canId);
176
//       }
177
//     } else {
178
//       let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
179
//       allUrls[i] = getUrlByType(type, null, canId);
180
//     }
181
//   }
182
//   return allUrls;
183
// }
184
//
185
/**
 * Builds the landing-page URL `landingPrefix + type + "?" + <parameter> + "=" + id`.
 *
 * The query parameter is "pid" whenever `pid` is truthy; otherwise it is
 * chosen from `type`: "articleId" (publication), "datasetId" (dataset),
 * "softwareId" (software), "orpId" (other), or "id" for anything else.
 *
 * @param type result type, also used as the path segment of the URL.
 * @param pid  persistent identifier object, or a falsy value when there is none.
 * @param id   value placed after "=" exactly as given.
 * @returns the assembled URL string.
 */
function getUrlByType(type: any, pid: any, id: any) {
  let queryKey: any;

  if (pid) {
    // A persistent identifier overrides the type-specific parameter name.
    queryKey = "pid";
  } else {
    switch (type) {
      case "publication":
        queryKey = "articleId";
        break;
      case "dataset":
        queryKey = "datasetId";
        break;
      case "software":
        queryKey = "softwareId";
        break;
      case "other":
        queryKey = "orpId";
        break;
      default:
        queryKey = "id";
    }
  }

  return landingPrefix+type+"?"+queryKey+"="+id;
}
204

  
205
/**
 * Starts loading the communities from `contextUrl` and stores the pending
 * operation in the module-level `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced with the ids of the communities
 * returned by `ContextsService.parseCommunities`. On any failure (request or
 * parsing) the error is logged and `communitiesPromise` still resolves, so
 * `publicCommunities` keeps its previous value.
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        reject(err);
        return;
      }
      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve();
      } catch (parseError) {
        // Always settle the promise: callers await it before handling the "community" filter.
        reject(parseError);
      }
    });
  }).catch(error => console.error("Error getting communities ", error));
}
218

  
219
/**
 * Entry point: starts the "total_time" timer, derives the dated sitemap and
 * error file names, writes the XML header of the sitemap, and then kicks off
 * the community lookup and the URL extraction.
 *
 * @param resultsPerUrl page size forwarded to `get`.
 */
function buildSiteMap(resultsPerUrl) {
  console.time("total_time");

  // Both files share the same year_month_day suffix (month is 1-based).
  const date = new Date();
  const dateSuffix = date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate();
  fileName = "sitemap_"+dateSuffix+".xml";
  errorFileName = "error_"+dateSuffix+".txt";
  console.log("Buiding sitemap in file: "+fileName+"\n");

  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";

  // Written synchronously so the header is guaranteed to be on disk before
  // any later append to the same file. A failure is logged and the run
  // continues, as before.
  try {
    fs.writeFileSync("./"+fileName, sitemap);
  } catch (err) {
    console.log("Error writing in file "+fileName+": ", err);
  }

  getCommunities();
  get(resultsPerUrl);
}
239

  
240

  
241
// File-system module used for every sitemap and error-file write.
const fs = require('fs');

// Output file names; both are assigned by buildSiteMap.
let fileName: string;
let errorFileName: string;

// Counters reported once all requests have finished.
let alreadyin = 0;  // duplicate urls
let notin = 0;      // urls written to the sitemap

// Pending community lookup and the community ids it produces (see getCommunities).
let communitiesPromise: Promise<any>;
let publicCommunities: any[] = [];

// Service endpoints and the prefix of the generated landing-page URLs.
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";
const contextUrl = "https://services.openaire.eu/openaire/contexts/";

// Run the extraction, requesting 100 results per (filter, value) query.
buildSiteMap(100);
modules/uoa-services-portal/branches/angular-11/services/sitemaps/package.json
1
{
2
  "name": "urls_for_sitemap",
3
  "version": "1.0.0",
4
  "description": "Caching in memory",
5
  "main": "cache.js",
6
  "scripts": {
7
    "start": "PORT=3100 node extractUrlsFromSearch.js"
8
  },
9
  "dependencies": {
10
    "superagent": "^5.0.5"
11
  },
12
  "devDependencies": {
13
    "typescript": "3.2.4",
14
    "@types/node": "^8.0.30"
15
  },
16
  "engines": {
17
    "node": "8.1.x"
18
  },
19
  "author": "Konstantina Galouni <kgalouni@di.uoa.gr>",
20
  "license": "NKUA"
21
}
modules/uoa-services-portal/branches/angular-11/services/sitemaps/tsconfig.json
1
{
2
  "compilerOptions": {
3
    "typeRoots": [
4
      "node_modules/@types"
5
    ],
6
    "noImplicitAny": false,
7
    "lib": [
8
      "es2017",
9
      "dom"
10
    ],
11
    "emitDecoratorMetadata": true,
12
    "experimentalDecorators": true
13
  }
14
}
modules/uoa-services-portal/branches/angular-11/services/sitemaps/run.sh
1
npx ts-node extractUrlsFromSearch.ts
0 2

  

Also available in: Unified diff