-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.js
79 lines (70 loc) · 2.45 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import jsdom from "jsdom";
import fs from "fs";
import path from 'path';
import { CSVFile } from "./CSVFile.js";
// jsdom is imported through its default export, so JSDOM is pulled off that object here.
const { JSDOM } = jsdom;
// Absolute path to data.csv; path.resolve() resolves it against the current
// working directory of the process, not against this file's location.
const dataFilePath = path.resolve('data.csv');
/**
 * Reports whether a speech URL already appears in the CSV data file.
 *
 * The check is a plain substring search over the raw file contents, so a URL
 * that is a prefix of a stored URL also counts as present.
 *
 * @param {string} url - Speech URL to look for.
 * @param {string} [filePath=dataFilePath] - CSV file to search; defaults to
 *   the module-level data file.
 * @returns {boolean} `true` if the file exists and contains `url`; `false` if
 *   the file does not exist yet or does not contain `url`.
 * @throws {Error} Any read error other than "file not found" is rethrown.
 */
function speechExsist(url, filePath = dataFilePath) {
  let content;
  try {
    content = fs.readFileSync(filePath);
  } catch (error) {
    // On a first run the CSV has not been created yet: nothing is saved.
    if (error && error.code === "ENOENT") {
      return false;
    }
    throw error;
  }
  return content.includes(url);
}
/**
 * Script entry point: walks the paginated Hindi speech listing, fetches the
 * details of each listed speech and appends one row per speech to the CSV.
 *
 * The run stops completely at the first speech whose URL is already present
 * in the CSV file (presumably the listing is newest-first, so everything
 * after it was saved by an earlier run - TODO confirm).
 *
 * Errors are logged and skipped at two levels: a page that cannot be loaded
 * skips only that page, and a speech that cannot be parsed or fetched skips
 * only that speech, so one bad entry does not discard the rest of its page.
 */
(async () => {
  const csvFile = new CSVFile({
    path: dataFilePath,
    headers: true
  });
  let count = 0;
  const start = 1;
  // Exclusive upper bound: the loop visits pages start .. end - 1.
  const end = 108;
  const lang = "hi";

  for (let page = start; page < end; page++) {
    const url = `https://www.narendramodi.in/speech/loadspeeche?page=${page}&language=${lang}`;
    console.log(url);

    let speechesBox;
    try {
      const dom = await JSDOM.fromURL(url);
      speechesBox = Array.from(dom.window.document.querySelectorAll(".speechesBox"));
    } catch (error) {
      // Listing page could not be loaded; move on to the next page.
      console.error(error);
      continue;
    }

    for (const speech of speechesBox) {
      try {
        // querySelector returns null for a missing element; the resulting
        // TypeError is caught below and only this speech is skipped.
        const { href, innerHTML: title } = speech.querySelector(".speechesItemLink.left_class a");
        const { innerHTML: date } = speech.querySelector(".pwdBy");
        const { src } = speech.querySelector("img");

        if (speechExsist(href)) {
          console.log("Saved speech found -", href, " Exiting now.");
          return;
        }

        const info = await getSpeechInfo(href);
        const speechData = {
          href,
          title: title.trim(),
          date: date.trim(),
          img: src,
          ...info
        };
        count++;
        await csvFile.append([speechData]);
      } catch (error) {
        console.error(error);
      }
    }
  }
  console.log(count, " speech added");
})();
/**
 * Loads a single speech page and extracts its embedded video URL and text.
 *
 * @param {string} url - URL of the speech page to load.
 * @returns {Promise<{youtubeURL: string, speechText: string}>} `youtubeURL`
 *   is the `src` of the first iframe inside `.articleBody`, or an empty
 *   string when the article embeds no iframe; `speechText` is the trimmed,
 *   non-empty text of every `<p>` inside `.articleBody`, joined by spaces.
 * @throws {Error} If the page cannot be loaded or has no `.articleBody`.
 */
async function getSpeechInfo(url) {
  console.log("Speech info: ", url);
  const dom = await JSDOM.fromURL(url);
  const articleBody = dom.window.document.querySelector(".articleBody");
  if (!articleBody) {
    throw new Error(`No .articleBody element found at ${url}`);
  }

  // Not every article has an embedded video; keep the text even without one.
  const iframe = articleBody.querySelector("iframe");
  const youtubeURL = iframe ? iframe.src : "";

  const texts = Array.from(
    articleBody.querySelectorAll("p"),
    (p) => (p.textContent ? p.textContent.trim() : "")
  ).filter((text) => text.length > 0);

  return {
    youtubeURL,
    speechText: texts.join(" "),
  };
}