forked from xianhu/PSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
58 lines (44 loc) · 1.95 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# _*_ coding: utf-8 _*_
"""
test.py by xianhu
"""
import logging
import sys

import spider
# Configure the root logger once for the whole demo:
# DEBUG level, lines formatted as "timestamp<TAB>LEVEL<TAB>message".
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s\t%(levelname)s\t%(message)s")
def test_spider():
    """
    Demo of the threaded WebSpider: crawl the Wandoujia app pages first
    (keeping the monitor alive), then the 360 app store (stopping it).
    """
    # Define fetcher, parser and saver; any of these three classes can be
    # subclassed to customize behavior.
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=1)
    saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w"))
    # URL filtering: UrlFilter is set-based, suitable when the URL count is small.
    # Raw strings are used for the regex patterns: "\." inside a non-raw literal
    # is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360))\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000)
    # url_filter.update([])
    # Initialize the WebSpider with a 5-second monitor interval.
    web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    # First crawl the Wandoujia pages; keep the monitor running afterwards.
    web_spider.set_start_url("http://www.wandoujia.com/apps")
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=False)
    # Then crawl the 360 app store; stop the monitor when done.
    web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0)
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    return
def test_spider_async():
    """
    Demo of the asyncio-based WebSpiderAsync: crawl the 360 app store
    and block until the crawl completes.
    """
    # Build the async spider: 3 retries per URL, no inter-fetch sleep,
    # crawl depth 1, results streamed to a plain-text output file.
    crawler = spider.WebSpiderAsync(
        max_repeat=3,
        sleep_time=0,
        max_deep=1,
        save_pipe=open("out_spider_async.txt", "w"),
        url_filter=spider.UrlFilter(),
    )
    # Seed the crawl with a single start URL.
    crawler.set_start_url("http://zhushou.360.cn/")
    # Start fetching with 10 concurrent fetchers and wait for completion.
    crawler.start_work_and_wait_done(fetcher_num=10)
    return
if __name__ == "__main__":
    # Run both demos sequentially: the threaded spider, then the async one.
    test_spider()
    test_spider_async()
    # sys.exit() instead of exit(): the bare exit() helper is injected by the
    # `site` module for interactive use and may be absent (e.g. `python -S`).
    sys.exit()