-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
28 lines (22 loc) · 829 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
def extract_nouns(query):
# Download necessary NLTK data files
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
# Define stopwords and tokenize the query
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(query)
# POS tagging
tagged_words = pos_tag(word_tokens)
# Extract nouns: remove stopwords and non-alphabetic tokens, and filter for nouns
nouns = [
word
for word, pos in tagged_words
if word.isalpha() and word.lower() not in stop_words and pos.startswith("NN")
]
# Return nouns as a comma-separated string
return nouns