forked from joelthchao/arxiv-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
65 lines (61 loc) · 2.61 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import urllib
import sqlite3
from BeautifulSoup import *
import re
# Listing-page URL template; placeholders are (field, two-digit year,
# two-digit month), filled in that order by the crawl loop.
url = 'http://arxiv.org/list/cs.{}/{}{}?show=1000'

# arXiv cs.* sub-fields to crawl.
fields = ['CV']

# Zero-padded two-digit formatter shared by the month and year lists.
_two_digits = '{:0>2d}'.format

# '01' .. '12'
months = [_two_digits(month_number) for month_number in range(1, 13)]

# '06' .. '16'
years = [_two_digits(year_number) for year_number in range(6, 17)]
# Schema for the crawl results: one row per paper, one row per author, and a
# Publications join table linking them. IF NOT EXISTS keeps the script
# re-runnable against an already populated database file.
_SCHEMA_SQL = '''
CREATE TABLE IF NOT EXISTS Papers (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
title TEXT UNIQUE,
url TEXT,
year INTEGER,
month INTEGER
);
CREATE TABLE IF NOT EXISTS Authors (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS Publications (
paper_id INTEGER,
author_id INTEGER,
PRIMARY KEY (paper_id, author_id)
);
'''

# Open (or create) the on-disk SQLite database and make sure the tables exist.
conn = sqlite3.connect('arxiv_raw.sqlite')
cur = conn.cursor()
cur.executescript(_SCHEMA_SQL)
# Crawl every (field, year, month) listing page and store its papers, their
# authors, and the paper<->author links in the SQLite database.
for field in fields:
    for year in years:
        for month in months:
            query_url = url.format(field, year, month)
            print('Retrieving {}'.format(query_url))

            # Close the HTTP handle even when the read fails, so a long crawl
            # does not leak one open handle per page.
            uh = urllib.urlopen(query_url)
            try:
                data = uh.read()
            finally:
                uh.close()

            soup = BeautifulSoup(data)
            titles = soup.findAll('div', {'class': 'list-title'})
            authors = soup.findAll('div', {'class': 'list-authors'})
            paper_urls = soup.findAll('span', {'class': 'list-identifier'})

            # The three lists are paired positionally below. zip() silently
            # truncates to the shortest one, so all three lengths must agree
            # or rows would be dropped or misaligned.
            if not (len(titles) == len(authors) == len(paper_urls)):
                print('number of titles, authors and urls mismatch')
                continue

            for title_div, author_div, id_span in zip(titles, authors, paper_urls):
                # The title text is the last child of the title <div>.
                title = title_div.contents[-1].strip()

                # Look the link up by attribute name rather than by position.
                # Assumes the first child of the identifier span is the
                # paper's <a href="..."> link with a site-relative path.
                paper_url = 'http://arxiv.org' + id_span.contents[0]['href']

                # Papers.title is UNIQUE: an already stored title is ignored
                # and the existing row id is reused.
                cur.execute('''
                    INSERT OR IGNORE INTO Papers (title, url, year, month)
                    VALUES (?, ?, ?, ?)''',
                            (title, paper_url, int(year), int(month)))
                cur.execute('SELECT id FROM Papers WHERE title = ? ', (title, ))
                paper_id = cur.fetchone()[0]

                paper_authors = [au.string.strip() for au in author_div.findAll('a')]
                for name in paper_authors:
                    # Authors.name is UNIQUE: reuse the row of a known author.
                    cur.execute('''
                        INSERT OR IGNORE INTO Authors (name)
                        VALUES (?)''', (name, ))
                    cur.execute('SELECT id FROM Authors WHERE name = ? ', (name, ))
                    author_id = cur.fetchone()[0]
                    cur.execute('''
                        INSERT OR REPLACE INTO Publications
                        (paper_id, author_id) VALUES (?, ?)''',
                                (paper_id, author_id))

            # Persist each listing page as soon as it is processed, so an
            # interrupted crawl keeps the pages already stored.
            conn.commit()