scraping.py
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
import json
import os
from collections import defaultdict
# Use a state's two-letter acronym to retrieve the municipalities of
# that state:
#
# "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT",
# "MS", "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO",
# "RR", "SC", "SP", "SE", "TO"
#
def get_state_municipalities(acronym: str):
    # IBGE page listing the official municipality codes
    IBGE_URL = "https://www.ibge.gov.br/explica/codigos-dos-municipios.php"
    response = requests.get(IBGE_URL)
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    # Each state has a <thead> whose id is the state acronym, followed by
    # a <tbody> holding one row per municipality
    thead = soup.find("thead", id=acronym)
    if thead is None:
        print(f"No table found for acronym: {acronym}")
        return None
    tbody = thead.find_next_sibling("tbody")
    municipalities = {}
    # Iterate over every municipality listed (one per table row)
    for row in tbody.find_all("tr"):
        columns = row.find_all("td")
        # First column holds the name, second the IBGE code
        name = unidecode(columns[0].a.text)  # strips accents, cedillas, etc.
        code = columns[1].text
        municipalities[name] = code
    return municipalities
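# A quick usage sketch (the name/code pair below is illustrative; the actual
# values are scraped from the IBGE table at runtime):
#
#   >>> municipalities = get_state_municipalities("RJ")
#   >>> municipalities["Rio de Janeiro"]
#   '3304557'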
# Searches gazette publications for a given piece of content.
# Use query_string to filter by keywords such as "abertura de licitação",
# "contrato", etc.
def scrape_by_gazettes(territory_ids, published_since="2000-01-01",
                       query_string="", excerpt_size=30,
                       number_of_excerpts=1, size=2000):
    # Querido Diário public API
    BASE_API_URL = "https://queridodiario.ok.org.br/api/"
    ENDPOINT = "gazettes"
    PARAMS = {
        "territory_ids": territory_ids,
        "published_since": published_since,
        "querystring": query_string,
        "excerpt_size": excerpt_size,
        "number_of_excerpts": number_of_excerpts,
        "pre_tags": "",
        "post_tags": "",
        "size": size,
        "sort_by": "relevance",
    }
    res = requests.get(f"{BASE_API_URL}{ENDPOINT}", params=PARAMS)
    return res.json()
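# Sketch of the response shape this script relies on (only "gazettes",
# "territory_id" and "txt_url" are read below; the other fields shown here
# are assumptions about the API response, not guaranteed):
#
#   {
#       "total_gazettes": 42,
#       "gazettes": [
#           {"territory_id": "3304557", "txt_url": "https://...", ...},
#           ...
#       ]
#   }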
# Test run
if __name__ == "__main__":
    acronym = "RJ"
    query_string = "licitação"
    published_since = "2020-01-01"
    municipalities_dict = get_state_municipalities(acronym)
    if municipalities_dict is None:
        raise SystemExit("Could not retrieve the municipality list.")
    # Every new dictionary key is initialized with an empty list
    txt_urls = defaultdict(list)
    # Scrape the gazettes of each municipality
    for code in municipalities_dict.values():
        try:
            res_json = scrape_by_gazettes(code, published_since, query_string)
        except (requests.RequestException, ValueError):
            # Skip municipalities whose request or JSON decoding failed
            continue
        print(f"Processing: {code}")
        for gazette in res_json.get("gazettes") or []:
            txt_urls[gazette.get("territory_id")].append(gazette.get("txt_url"))
    # Save everything to a JSON file
    os.makedirs("data", exist_ok=True)  # make sure the output folder exists
    with open(f"data/gazettes_{acronym}.json", "w") as file:
        json.dump(dict(txt_urls), file, indent=4)
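# A minimal sketch of consuming the saved file afterwards (assumes the run
# above produced data/gazettes_RJ.json and that the listed txt_urls are
# reachable):
#
#   with open("data/gazettes_RJ.json") as file:
#       urls_by_territory = json.load(file)
#   some_urls = next(iter(urls_by_territory.values()))
#   text = requests.get(some_urls[0]).text  # full text of one gazette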