-
Notifications
You must be signed in to change notification settings - Fork 0
/
Exercise 18- TF-IDF.py
104 lines (80 loc) · 2.95 KB
/
Exercise 18- TF-IDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Beginner: B B A A
# NOTE(review): the string literal below is a bare expression that is not
# assigned to any name, so it has no effect at runtime; presumably it is the
# text the beginner answer above refers to -- confirm.
"""Humpty Dumpty sat on a wall,
Humpty Dumpty had a great fall.
All the king's horses and all the king's men
Couldn't put Humpty together again."""
# Intermediate
# DATA BLOCK: sample corpus with one document per line; it is the argument
# of the main(text) call further down.
text = '''he really really loves coffee
my sister dislikes coffee
my sister loves tea'''
import math
def main(text):
    """Print one tf-idf vector per line of *text*.

    Every line of *text* is treated as a document and split on whitespace.
    For each document a list is printed that holds, for every vocabulary
    word, ``tf * log10(1 / df)`` where

    * ``tf`` is the number of occurrences of the word in the document divided
      by the document length, and
    * ``df`` is the fraction of documents that contain the word.

    The vocabulary (and therefore the column order of every printed list) is
    the set of distinct words in order of first appearance, so the output is
    reproducible from run to run.

    A blank line counts as a document: it is included in the document total
    and is printed as an all-zero vector instead of raising
    ``ZeroDivisionError``.

    Args:
        text: The corpus, one document per line.

    Returns:
        None. The vectors are written to standard output.
    """
    # split the text first into lines and then into lists of words
    docs = [line.split() for line in text.splitlines()]
    n_docs = len(docs)

    # create the vocabulary: every word that appears at least once.
    # dict.fromkeys de-duplicates while keeping first-appearance order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    vocabulary = list(dict.fromkeys(word for doc in docs for word in doc))

    # idf depends only on the word, so compute it once per word rather than
    # once per (document, word) pair.
    idf = {}
    for word in vocabulary:
        # df: share of documents containing the word (never 0 for a
        # vocabulary word, so 1 / df is always defined)
        df = sum(word in doc for doc in docs) / n_docs
        # base 10 logarithm, as the exercise requires
        idf[word] = math.log(1 / df, 10)

    # loop through documents to calculate and print the tf-idf values
    for doc in docs:
        doc_len = len(doc)
        tfidf = [
            # an empty document has term frequency 0 for every word
            (doc.count(word) / doc_len if doc_len else 0.0) * idf[word]
            for word in vocabulary
        ]
        print(tfidf)
# Run the intermediate exercise: print the tf-idf vectors of the corpus
# currently bound to `text`.
main(text)
#Advanced
import numpy as np
import math
# Corpus for the advanced exercise, one document per line. This rebinds the
# module-level name `text` that the intermediate exercise used above.
text = '''Humpty Dumpty sat on a wall
Humpty Dumpty had a great fall
all the king's horses and all the king's men
couldn't put Humpty together again'''
def find_tfidf(text):
    """Return the tf-idf matrix of *text*, one row per line.

    The text is lower-cased, split into lines on ``'\\n'`` and each line is
    split on whitespace to form a document. Entry ``[d][w]`` of the result is
    ``tf * log10(1 / df)`` where ``tf`` is the count of word ``w`` in document
    ``d`` divided by the document length and ``df`` is the fraction of
    documents that contain ``w``.

    Columns follow the order in which words first appear in the text, so the
    matrix is reproducible from run to run.

    A blank line (including the empty last line produced by a trailing
    newline) counts as a document: it is included in the document total and
    yields an all-zero row instead of raising ``ZeroDivisionError``. Row
    indices therefore always match line indices of *text*.

    Args:
        text: The corpus, one document per line.

    Returns:
        A list of rows (lists of floats), one row per line of *text* and one
        column per distinct word.
    """
    docs = [line.lower().split() for line in text.split('\n')]
    n_docs = len(docs)

    # dict.fromkeys de-duplicates while keeping first-appearance order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    vocabulary = list(dict.fromkeys(word for doc in docs for word in doc))

    # idf depends only on the word, so compute it once per word.
    idf = {}
    for word in vocabulary:
        # df is never 0 for a vocabulary word, so 1 / df is always defined
        df = sum(word in doc for doc in docs) / n_docs
        idf[word] = math.log(1 / df, 10)

    data = []
    for doc in docs:
        doc_len = len(doc)
        data.append([
            # an empty document has term frequency 0 for every word
            (doc.count(word) / doc_len if doc_len else 0.0) * idf[word]
            for word in vocabulary
        ])
    return data
def distance(row1, row2):
    """Return the square root of the summed absolute differences of two rows.

    The rows are compared position by position over the indices of *row1*.
    Note that this is the square root of the Manhattan (L1) distance, not the
    Euclidean distance. Because the square root is monotonic, it ranks pairs
    the same way the plain Manhattan distance would.

    Args:
        row1: Sequence of numbers; its length drives the iteration.
        row2: Sequence of numbers, indexed at every position of *row1*.

    Returns:
        ``np.sqrt`` of the accumulated absolute differences.
    """
    abs_diff_sum = 0
    for position, left in enumerate(row1):
        abs_diff_sum += abs(left - row2[position])
    return np.sqrt(abs_diff_sum)
def find_nearest_pair(data):
    """Print and return the index pair of the two closest rows of *data*.

    A square matrix of pairwise ``distance`` values is built with ``np.inf``
    on the diagonal, so a row is never reported as its own nearest
    neighbour. The position of the smallest entry is then converted back to
    a (row, column) pair.

    Args:
        data: Sequence of rows accepted by ``distance``.

    Returns:
        The tuple produced by ``np.unravel_index`` for the smallest
        off-diagonal distance; the same tuple is printed.
    """
    count = len(data)
    # Start from all-infinity so the diagonal never wins the argmin.
    pairwise = np.full((count, count), np.inf, dtype=float)
    for row_idx, row in enumerate(data):
        for col_idx, other in enumerate(data):
            if row_idx != col_idx:
                pairwise[row_idx][col_idx] = distance(row, other)
    flat_winner = np.argmin(pairwise)
    nearest_index = np.unravel_index(flat_winner, (count, count))
    print(nearest_index)
    return nearest_index
# Run the advanced exercise: build the tf-idf matrix of `text` and print the
# index pair of the two lines with the smallest distance between them.
find_nearest_pair(find_tfidf(text))