-
Notifications
You must be signed in to change notification settings - Fork 1
/
embeddingFuncs.py
100 lines (87 loc) · 3.75 KB
/
embeddingFuncs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
def embedArticle(genai, embeddings: list, embedding_model: str, article):
    """
    Build one embedding record for a whole article and add it to the embeddings array
    @param genai: The google gemini variable; its embed_content(model=..., content=...) is called
    @param embeddings: The array that receives the new record (it is mutated in place)
    @param embedding_model: As a string, which embedding model to use
    @param article: The full article JSON - 'id', 'date', 'date_gmt', 'link' and content.rendered are read
    @return: None
    """
    # Pull out everything needed for the request before calling the model,
    # so a malformed article fails before any embedding call is made
    rendered_text, record_id = article['content']['rendered'], str(article['id'])

    # Ask the embedding model for the vector of the rendered page content
    response = genai.embed_content(model=embedding_model, content=rendered_text)

    # Store the vector under the article id (as a string) along with date/link metadata
    embeddings.append({
        "id": record_id,
        "values": response['embedding'],
        "metadata": {
            "date": article['date'],
            "date_gmt": article['date_gmt'],
            "link": article['link'],
        },
    })
def embedChunksAsArticle(genai, embeddings: list, embedding_model: str, article, split_texts: list):
    """
    Embed every chunk of an article and add all of the chunk records to the embeddings array at once
    @param genai: The google gemini variable; its embed_content(model=..., content=...) is called per chunk
    @param embeddings: The array that receives the chunk records (only touched if every chunk succeeds)
    @param embedding_model: As a string, which embedding model to use
    @param article: The article JSON - 'id', 'date', 'date_gmt' and 'link' are read for each record
    @param split_texts: The article's rendered text already split into chunks (text only, no date/link info)
    @return: A bool - True if all chunks were embedded and appended, False if any chunk raised an error
    """
    # Records are staged here first so a failure part-way through leaves embeddings untouched
    staged = []

    for position, chunk in enumerate(split_texts):
        try:
            response = genai.embed_content(model=embedding_model, content=chunk)
            # Each chunk id is the article id plus a chunk number suffix, keeping ids unique
            staged.append({
                "id": f"{article['id']!s}_chunk{position}",
                "values": response['embedding'],
                "metadata": {
                    "date": article['date'],
                    "date_gmt": article['date_gmt'],
                    "link": article['link'],
                    "chunk_total": len(split_texts),
                },
            })
        except Exception as err:
            # Any error on any chunk aborts the whole article: report it and signal failure
            print(err)
            return False

    # Every chunk succeeded, so commit the staged records in one go
    embeddings.extend(staged)
    return True
def generateQueryEmbedding(genai, embedding_model: str, query: str):
    """
    Turn a query string into an embedding using google gemini
    @param genai: The google gemini variable; its embed_content(model=..., content=...) is called
    @param embedding_model: As a string, which embedding model to use
    @param query: The query string
    @return: The value under "embedding" in the response, or None if anything went wrong
    """
    try:
        response = genai.embed_content(model=embedding_model, content=query)
        query_vector = response["embedding"]
    except Exception as err:
        # Failures are reported on stdout and signalled to the caller with None
        print(f"Error generating query embedding: {err}")
        return None

    # Success path: hand back just the vector, not the whole response
    return query_vector