-
Notifications
You must be signed in to change notification settings - Fork 0
/
1.js
65 lines (55 loc) · 1.82 KB
/
1.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
const Crawler = require('crawler');
const cherio = require('cherio');
const fs = require('fs');
// Image file names gathered by the crawler callback; serialized to
// objects.json once every queued page has been handled.
const array = [];

// Number of queued pages whose callback has not run yet. It is set when the
// links are queued and decremented once per callback invocation.
let total = 0;
/**
 * Returns the file name of the last image listed in the page's first
 * JSON-LD <script> tag.
 *
 * @param {Function} $ - Selector function for the crawled page (res.$).
 * @returns {string|null} The image file name with any query string removed,
 *   or null when the page has no usable JSON-LD image entry.
 */
function lastImageName($) {
  // Without a selector function there is nothing to query.
  if (typeof $ !== 'function') {
    return null;
  }

  const scripts = $('script[type="application/ld+json"]');
  // The selector returns a collection even when nothing matches, so a null
  // check is not enough; emptiness has to be tested through length.
  if (scripts.length === 0) {
    return null;
  }

  const textNode = scripts[0].children && scripts[0].children[0];
  if (!textNode || typeof textNode.data !== 'string') {
    return null;
  }

  let json;
  try {
    json = JSON.parse(textNode.data);
  } catch (err) {
    console.log("* invalid JSON-LD: ", err.message);
    return null;
  }
  if (json === null || typeof json !== 'object') {
    return null;
  }

  // A single image URL given as a plain string is treated as a one-item list
  // instead of being indexed character by character.
  const images = typeof json.image === 'string' ? [json.image] : json.image;
  if (!Array.isArray(images) || images.length === 0) {
    return null;
  }

  const last = images[images.length - 1];
  if (typeof last !== 'string') {
    return null;
  }

  // Strip the query string only when there is one; substring(0, -1) would
  // otherwise yield an empty string.
  const queryStart = last.indexOf('?');
  const withoutQuery = queryStart === -1 ? last : last.substring(0, queryStart);
  const name = withoutQuery.substring(withoutQuery.lastIndexOf('/') + 1);
  return name === '' ? null : name;
}

const c = new Crawler({
  // NOTE(review): the crawler documentation says a non-zero rateLimit forces
  // a single connection, which would make maxConnections ineffective here;
  // confirm which of the two settings is intended.
  maxConnections: 20,
  rateLimit: 10,
  // This will be called for each crawled page
  callback: (error, res, done) => {
    try {
      if (error) {
        console.log(error);
      } else {
        const url = res.request.uri.href;
        const id = url.substring(url.lastIndexOf('/') + 1).replace('dkp-', '');
        console.log(id);

        const name = lastImageName(res.$);
        if (name === null) {
          console.log("* skipping: ", url);
        } else {
          array.push(name);
        }
      }
    } catch (err) {
      // A failure on one page must not prevent the bookkeeping below from
      // running, otherwise the crawl stalls and the output is never written.
      console.log(err);
    }

    total -= 1;
    console.log("remaining: ", total);
    // Once the last page has been handled, persist everything collected.
    if (total === 0) {
      fs.writeFile("objects.json", JSON.stringify(array), function(err) {
        if (err) {
          console.log(err);
        }
      });
    }
    done();
  }
});
/**
 * Splits comma-separated text into a list of links.
 *
 * Surrounding whitespace (including newlines) is removed from every entry and
 * empty entries are dropped, so a trailing comma or trailing newline does not
 * produce a bogus link.
 *
 * @param {string} text - Comma-separated links.
 * @returns {string[]} The non-empty, trimmed links in their original order.
 */
function parseLinks(text) {
  return text
    .split(",")
    .map((link) => link.trim())
    .filter((link) => link !== "");
}

// Read the product links, record how many pages are pending, and queue them.
try {
  const text = fs.readFileSync('products.txt', 'utf8');
  const links = parseLinks(text);
  total = links.length;
  console.log("\n\n" + links.length + " products to crawl..." + "\n");
  // With no links there is nothing to queue.
  if (links.length > 0) {
    c.queue(links);
  }
} catch (e) {
  console.log('Error:', e.stack);
}