-
Notifications
You must be signed in to change notification settings - Fork 0
/
FinalProject_Giang_Alex.py
255 lines (220 loc) · 9.66 KB
/
FinalProject_Giang_Alex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# Final Project CIS 117
# Alex Giang
# 19 May 2023
if __name__ == '__main__':
'''
The main method creates the Tkinter window tool Word Frequency Search.
'''
import time
from tkinter import *
from urllib.request import urlopen
from html.parser import HTMLParser
import re
import sqlite3
try:
con = sqlite3.connect('first250.db')
cur = con.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS freq_search_giang
(title text, report text);
""")
except:
pass
start_time = time.time()
class MyHTMLParser(HTMLParser):
'''
An extended class of HTMLParser with method handle_data overriden and two new methods frequency and dump_data
'''
def __init__(self):
'''
Initializes a new instance of MyHTMLParser.
'''
HTMLParser.__init__(self)
self.title = "none"
self.counts = {}
def handle_data(self, data):
'''
Overrides handle_data in the HTMLParser class and counts words in body text, stripped of punctuation other
than apostrophes, and updates the dictionary self.counts with new terms and increased count values.
'''
import re
data = data.lower()
n = data.find("*** start of the project gutenberg ebook ") + 41
if n >= 41 and self.title == "none":
self.title = re.sub(pattern=r"[*'\r\n;\!\\?\-\",\.\+]", repl='', string=data[n:])
clean = re.sub(pattern=r"[\'\r\n;\!\\?\-\",\.\+]", repl='', string=data)
clean = re.sub(pattern=r'[-—]', repl=' ', string=clean)
words = clean.split(" ")
for word in words:
if word in self.counts.keys():
self.counts[word] += 1
else:
if len(word) > 3:
self.counts[word] = 1
def get_report(self, n_rows):
'''
Returns a table that describes how many times each word appears in the text attribute,
including n_rows/3 rows each with three ranks, words, and counts. Words with less than ten
occurrences are omitted. Words shorter than four letters are omitted.
'''
self.counts = {key: val for key, val in self.counts.items() if val > 10}
keys = list(self.counts.keys())
values = list(self.counts.values())
def argsort(seq):
'''
This helper method takes a sequence and outputs the keys of items in order of ascending value.
'''
return sorted(range(len(seq)), key=seq.__getitem__)
sorted_value_index = argsort(values)
self.counts = {keys[i]: values[i] for i in reversed(sorted_value_index)}
ret = 'Results for book with title ' + self.title.upper() + ':\n'
n = 1
for key, value in self.counts.items():
ret += ("{:<16}{:<5}| ".format((str(n) + '. ' + key), value))
if n >= n_rows:
break
if n % 3 == 0:
ret += '\n'
n += 1
try:
command = "INSERT INTO freq_search_giang VALUES ('" + self.title.upper() + "', '" + ret + "');"
print(command)
cur.execute(command)
con.commit()
except:
pass
return ret
def get_link(query):
'''
This method takes a query for a title, searches it on gutenberg.org, and outputs the HTML5
link of the book that came up as the first result of the search. If the search fails,
it will use the error book - the Declaration of Independence.
'''
query = query.replace(' ', '+')
q_link = "https://www.gutenberg.org/ebooks/search/?query=" + query + "&submit_search=Go%21"
class MyHTMLParser2(HTMLParser):
def __init__(self):
'''
Initializes a new instance of MyHTMLParser.
'''
HTMLParser.__init__(self)
self.book_num = '1'
def handle_starttag(self, tag, attrs):
# Only parse the 'anchor' tag.
if tag == "a":
# Check the list of defined attributes.
for name, value in attrs:
# If href is defined, print it.
if name == "href":
if re.match("/ebooks/[0-9]", value):
self.book_num = value[8:]
if self.book_num != '1':
break
response2 = urlopen(q_link)
html_page2 = response2.read()
html_page2 = html_page2.decode()
parser2 = MyHTMLParser2()
parser2.feed(html_page2)
n = parser2.book_num
link = "https://www.gutenberg.org/cache/epub/" + n + "/pg" + n + ".html"
try:
urlopen(link)
except:
try:
link = "https://www.gutenberg.org/cache/epub/" + n + "/pg" + n + "-images.html"
urlopen(link)
except:
link = 'https://www.gutenberg.org/files/1/1-h/1-h.htm' # give up
return link
def click():
'''
Handles downclicks of buttons by processing the URL in txt (entry box)
opening the URL, parsing the resulting HTML in MyHTMLParser for emails, and counting the
number of qualifying emails and displaying them in the output text box.
'''
output.delete(0.0, END)
query = txt.get()
try:
if -1 < txt2.get().find("http://"):
txt2.insert(0.0, "http://")
urlopen(txt2.get())
except:
if query.find('.') > -1:
link = query # accidentally putting a url in the first text box
else:
link = get_link(query)
if len(query) < 1:
link = 'https://www.gutenberg.org/files/1/1-h/1-h.htm' # give up
else:
link = txt2.get()
if -1 < link.find("http://"):
link = "https://" + link
try:
command = "SELECT * FROM freq_search_giang WHERE title LIKE '%" + query.strip().upper() + "%';"
cur.execute(command)
if len(cur.fetchone()[1]) > 200:
output.insert(END, cur.fetchone()[1])
else:
raise
except:
response = urlopen(link)
html_page = response.read()
html_page = html_page.decode()
parser = MyHTMLParser()
parser.feed(html_page)
parser.get_report(50)
output.insert(END, str(parser.get_report(50)))
# BELOW IS CODE TO LOAD UP THE LOCAL DATABASE, CHANGE THE LOWER BOUND TO 1 AND RUN IT (LOAD TAKES ABOUT 7 MINUTES)
# If first250.db is included, you don't have to run this.
# INCLUDES FIRST 250 PROJECT GUTENBERG BOOKS
for n in range(250, 250):
n = str(n)
try:
link = "https://www.gutenberg.org/cache/epub/" + n + "/pg" + n + ".html"
try:
urlopen(link)
except:
try:
link = "https://www.gutenberg.org/cache/epub/" + n + "/pg" + n + "-images.html"
urlopen(link)
except:
continue
response = urlopen(link)
html_page = response.read()
html_page = html_page.decode()
parser = MyHTMLParser()
parser.feed(html_page)
parser.get_report(50)
except:
continue
# main
win = Tk()
win.title("Word Frequency Search on Book")
win.configure(background='beige')
# headings
Label(win, text="Hello! Hi! Welcome!", bg='beige', font='Modern 48 bold italic').grid(row=1, column=1)
Label(win,
text="Word Frequency Search on Book by Alex Giang\n\nThis program takes a book title as an input,"
"\nopens the book, and outputs a table of the 50\nmost frequently used words in that book!"
"\n\nENTER THE TITLE YOU WOULD LIKE TO SEARCH HERE!", bg='beige', font='Modern 18').grid(row=2, column=1)
# entry box for title
txt = Entry(win, width=60, bg="white")
txt.grid(row=3, column=1)
Label(win, text=" ", bg='beige').grid(row=4, column=1)
# GO (submit) button for title
Button(win, text="GO!", width=6, command=click, bg='tan2').grid(row=5, column=1)
Label(win, text="\nOR ENTER A PROJECT GUTENBERG LINK HERE!", bg='beige', font='Modern 18').grid(row=7, column=1)
# second entry box for link
txt2 = Entry(win, width=60, bg="white")
txt2.grid(row=8, column=1)
Label(win, text=" ", bg='beige').grid(row=9, column=1)
# second GO (submit) button for link
Button(win, text="GO!", width=6, command=click, bg='tan2').grid(row=10, column=1)
Label(win, text="\n" + "." * 180 + "\n", bg='beige').grid(row=11, column=1)
# Output box
Label(win, text="Results will show up here: (scroll down for more)", bg='beige', font='Modern 18').grid(row=12,
column=1)
output = Text(win, width=71, height=6, wrap=WORD, background="tan4", fg="white")
output.grid(row=12, column=1)
print(type(output))
win.mainloop()