Project

General

Profile

« Previous | Next » 

Revision 53685

Added safeguard parsing of the produced XML to catch badly escaped illegal characters (e.g. etf: \001 escaped as &#1;). Changed harvesting thread execution from a single-thread ExecutorService to Thread.start().

View differences:

modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/Utils.java
164 164
		return null;
165 165
	}
166 166

  
167
	public static Boolean validateXml(String xml){
168
		try {
169
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
170
			DocumentBuilder builder = factory.newDocumentBuilder();
171
			InputSource is = new InputSource(new StringReader(xml));
172
			builder.parse(is);
173
			return true;
174
		}catch(Exception ex){
175
			return false;
176
		}
177
	}
178

  
167 179
	public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
168 180

  
169 181
		int skipped = 0;
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/kaggle/KaggleRepositoryIterable.java
117 117
	public void bootstrap() {
118 118
		this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
119 119

  
120
		ExecutorService executor = Executors.newSingleThreadExecutor();
121
		executor.execute(new Harvester());
122
		executor.shutdown();
120
		Thread ft = new Thread(new Harvester() );
121
		ft.start();
122
//		ExecutorService executor = Executors.newSingleThreadExecutor();
123
//		executor.execute(new Harvester());
124
//		executor.shutdown();
123 125
	}
124 126

  
125 127
	@Override
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapIndexRepositoryIterable.java
64 64
	public void bootstrap() {
65 65
		this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
66 66

  
67
		ExecutorService executor = Executors.newSingleThreadExecutor();
68
		executor.execute(new Harvester());
69
		executor.shutdown();
67
		Thread ft = new Thread(new Harvester() );
68
		ft.start();
69
//		ExecutorService executor = Executors.newSingleThreadExecutor();
70
//		executor.execute(new Harvester());
71
//		executor.shutdown();
70 72
	}
71 73

  
72 74
	@Override
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java
111 111
		else {
112 112
			log.debug("building document");
113 113
			xml = this.buildDataset(document);
114
			if (xml == null){
114
			if (!Utils.validateXml(xml)) {
115
				log.debug("xml not valid. setting to empty");
116
				xml = null;
117
			}
118
			if (xml == null) {
115 119
				log.debug("could not build xml. returning empty");
116 120
				xml = DatasetDocument.emptyXml();
117 121
			}

Also available in: Unified diff