Feat: Hivemind include url for documents #330

Merged
15 commits merged on Nov 28, 2024

Changes from all commits
37 changes: 34 additions & 3 deletions dags/hivemind_etl_helpers/notion_etl.py
@@ -1,6 +1,8 @@
import copy
import logging

from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
from llama_index.core import Document
from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline


@@ -56,7 +58,8 @@ def process(
documents = self.notion_extractor.extract(
page_ids=page_ids, database_ids=database_ids
)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_page(self, page_id: str) -> None:
"""
@@ -71,7 +74,8 @@ def process_page(self, page_id: str) -> None:
f"Processing page_id: {page_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_pages(page_ids=[page_id])
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_database(self, database_id: str) -> None:
"""
@@ -86,4 +90,31 @@ def process_database(self, database_id: str) -> None:
f"Processing database id: {database_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_database(database_id=database_id)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def _transform_documents(self, documents: list[Document]) -> list[Document]:
"""
transform Notion-extracted documents by inserting a URL into their metadata

Parameters
------------
documents : list[Document]
a list of notion extracted pages

Returns
---------
documents : list[Document]
a list of documents, each with a URL included in its metadata
"""
# deep copy so the original extracted documents are left unmodified
transformed_docs: list[Document] = copy.deepcopy(documents)

for doc in transformed_docs:
page_id: str | None = doc.metadata.get("page_id")
if page_id is None:
doc.metadata["url"] = None
else:
doc.metadata["url"] = f"https://www.notion.so/{page_id}"

return transformed_docs
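
For context, the added transform can be exercised on its own. A minimal sketch, assuming llama-index's Document and a made-up page id:

from llama_index.core import Document

doc = Document(text="meeting notes", metadata={"page_id": "a1b2c3d4"})  # hypothetical page id

# mirrors _transform_documents above: page_id present -> public Notion URL, absent -> None
page_id = doc.metadata.get("page_id")
doc.metadata["url"] = None if page_id is None else f"https://www.notion.so/{page_id}"
print(doc.metadata["url"])  # https://www.notion.so/a1b2c3d4

The method deep-copies its input first, so the extractor's original documents are left unmodified.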
@@ -95,6 +95,10 @@ def prepare_document(
reactions = message["reactions"]
raw_content = message["content"]

message_id = message["messageId"]
channel_id = message["channelId"]
thread_id = message["threadId"]

reaction_ids = prepare_raction_ids(reactions)

mention_names: list[str]
@@ -161,10 +165,16 @@ def prepare_document(
# always has length 1
assert len(author_name) == 1, "Either None or multiple authors!"

if thread_id is None:
url = f"https://discord.com/channels/{guild_id}/{channel_id}/{message_id}"
else:
url = f"https://discord.com/channels/{guild_id}/{thread_id}/{message_id}"

msg_meta_data = {
"channel": message["channelName"],
"date": message["createdDate"].strftime(DATE_FORMAT),
"author_username": author_name[0],
"url": url,
# thread_name is always included; if `None`, the message was a channel message
"thread": message["threadName"],
}
@@ -234,6 +244,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
doc.excluded_llm_metadata_keys = [
"author_nickname",
Expand All @@ -250,6 +261,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
else:
doc = Document(text=content_url_updated)
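The URLs above follow Discord's /channels/{guild_id}/{channel_or_thread_id}/{message_id} scheme: messages inside a thread are addressed through the thread id, plain channel messages through the channel id. A minimal standalone sketch with made-up ids (the helper name is ours, not the repo's):

def discord_message_url(guild_id: str, channel_id: str, thread_id: str | None, message_id: str) -> str:
    # thread messages are linked via the thread id, channel messages via the channel id
    parent_id = channel_id if thread_id is None else thread_id
    return f"https://discord.com/channels/{guild_id}/{parent_id}/{message_id}"

print(discord_message_url("1234", "111111", None, "10000000000"))
# https://discord.com/channels/1234/111111/10000000000

Note that "url" is appended to both excluded_embed_metadata_keys and excluded_llm_metadata_keys, so the link is stored with each document but never fed to the embedding model or the LLM.
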
2 changes: 2 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py
@@ -55,8 +55,10 @@ def fetch_raw_posts(
author.username AS author_username,
author.name AS author_name,
t.title AS topic,
t.id AS topic_id,
p.id AS postId,
$forum_endpoint AS forum_endpoint,
p.postNumber as post_number,
p.raw AS raw,
p.createdAt AS createdAt,
p.updatedAt AS updatedAt,
@@ -33,12 +33,18 @@ def transform_raw_to_documents(
doc: Document

if not exclude_metadata:
forum_endpoint = post["forum_endpoint"]
topic_id = post["topic_id"]
post_number = post["post_number"]

link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}"

doc = Document(
text=post["raw"],
metadata={
"author_name": post["author_name"],
"author_username": post["author_username"],
"forum_endpoint": post["forum_endpoint"],
"forum_endpoint": forum_endpoint,
"createdAt": post["createdAt"],
"updatedAt": post["updatedAt"],
"postId": post["postId"],
Expand All @@ -49,6 +55,7 @@ def transform_raw_to_documents(
"liker_names": post["liker_names"],
"replier_usernames": post["replier_usernames"],
"replier_names": post["replier_names"],
"link": link,
},
)
else:
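The new link targets the /t/{topic_id}/{post_number} form that Discourse resolves to a specific post within a topic, assembled from the three fields the query now returns. A minimal sketch with made-up values:

forum_endpoint = "forum.example.org"  # hypothetical forum endpoint
topic_id = 42
post_number = 3

link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}"
print(link)  # https://forum.example.org/t/42/3
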
21 changes: 20 additions & 1 deletion dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py
@@ -1,3 +1,4 @@
import copy
import logging
import os
from typing import List, Optional
@@ -49,7 +50,9 @@ def load_data(
documents.extend(self._load_from_files(file_ids))
if not documents:
raise ValueError("One input at least must be given!")
return documents

transformed_documents = self._transform_google_documents(documents)
return transformed_documents

def _load_from_folders(self, folder_ids: List[str]):
folders_data = []
@@ -108,3 +111,19 @@ def _load_google_drive_creds(self) -> tuple[str, str]:
raise ValueError("`GOOGLE_CLIENT_SECRET` not found from env variables!")

return client_id, client_secret

def _transform_google_documents(self, documents: list[Document]) -> list[Document]:
"""
transform Google-extracted documents by inserting a URL into their metadata
"""
# deep copy so the original extracted documents are left unmodified
transformed_docs: list[Document] = copy.deepcopy(documents)

for doc in transformed_docs:
file_id: str | None = doc.metadata.get("file id")
if file_id is None:
doc.metadata["url"] = None
else:
doc.metadata["url"] = f"https://drive.google.com/file/d/{file_id}/view"

return transformed_docs
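
The Drive transform mirrors the Notion one. A minimal sketch, assuming llama-index's Document and a made-up file id:

from llama_index.core import Document

doc = Document(text="design spec", metadata={"file id": "0B1-abc"})  # hypothetical file id

file_id = doc.metadata.get("file id")  # the loader stores the id under "file id", with a space
doc.metadata["url"] = None if file_id is None else f"https://drive.google.com/file/d/{file_id}/view"
print(doc.metadata["url"])  # https://drive.google.com/file/d/0B1-abc/view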
2 changes: 1 addition & 1 deletion dags/hivemind_etl_helpers/src/db/github/extract/commit.py
@@ -78,7 +78,7 @@ def _fetch_raw_commits(
user_commiter.login AS committer_name,
co.`commit.message` AS message,
co.`commit.url` AS api_url,
co.`parents.0.html_url` AS html_url,
co.`parents.0.html_url` AS url,
co.repository_id AS repository_id,
repo.full_name AS repository_name,
co.sha AS sha,
8 changes: 4 additions & 4 deletions dags/hivemind_etl_helpers/src/db/github/schema/commit.py
@@ -8,7 +8,7 @@ def __init__(
committer_name: str,
message: str,
api_url: str,
html_url: str,
url: str,
repository_id: int,
repository_name: str,
sha: str,
@@ -26,7 +26,7 @@ def __init__(
self.committer_name = committer_name
self.message = message
self.api_url = api_url
self.html_url = html_url
self.url = url
self.repository_id = repository_id
self.repository_name = repository_name
self.sha = sha
@@ -42,7 +42,7 @@ def from_dict(cls, data: dict[str, str | int | None]) -> "GitHubCommit":
committer_name=data["committer_name"], # type: ignore
message=data["message"], # type: ignore
api_url=data["api_url"], # type: ignore
html_url=data["html_url"], # type: ignore
url=data["url"], # type: ignore
repository_id=data["repository_id"], # type: ignore
repository_name=data["repository_name"], # type: ignore
sha=data["sha"], # type: ignore
@@ -58,7 +58,7 @@ def to_dict(self) -> dict[str, str | int | None]:
"committer_name": self.committer_name,
"message": self.message,
"api_url": self.api_url,
"html_url": self.html_url,
"url": self.url,
"repository_id": self.repository_id,
"repository_name": self.repository_name,
"sha": self.sha,
@@ -27,7 +27,7 @@ def transform_commits(data: list[GitHubCommit]) -> list[Document]:
metadata=metadata,
# all metadata to be excluded from embedding model
excluded_embed_metadata_keys=list(metadata.keys()),
excluded_llm_metadata_keys=["sha", "api_url", "html_url", "verification"],
excluded_llm_metadata_keys=["sha", "api_url", "url", "verification"],
)
transformed_commits.append(document)

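With the rename, the commit link lives under the same "url" metadata key the other platforms now use. transform_commits excludes every metadata key from the embedding input and "url" (among others) from the LLM-visible text. A minimal sketch of how llama-index applies such exclusions (values are made up):

from llama_index.core import Document
from llama_index.core.schema import MetadataMode

doc = Document(
    text="fix: handle empty commit messages",
    metadata={"url": "https://github.com/org/repo/commit/abc123", "sha": "abc123"},  # made-up values
    excluded_embed_metadata_keys=["url", "sha"],  # hidden from the embedding model
    excluded_llm_metadata_keys=["url"],  # hidden from the LLM
)
print(doc.get_content(metadata_mode=MetadataMode.LLM))  # includes sha, omits url
print(doc.get_content(metadata_mode=MetadataMode.EMBED))  # omits both
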
@@ -23,6 +23,9 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"""
transformed_docs: list[Document] = []

# Telegram links omit the "-100" prefix that supergroup chat ids carry
chat_id = str(self.chat_id).removeprefix("-100")

for message in messages:
document = Document(
text=message.message_text,
@@ -35,6 +38,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies": message.repliers,
"reactors": message.reactors,
"chat_name": self.chat_name,
"url": f"https://t.me/c/{chat_id}/{message.message_id}",
},
excluded_embed_metadata_keys=[
"author",
@@ -44,6 +48,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
excluded_llm_metadata_keys=[
"createdAt",
@@ -52,6 +57,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
)
transformed_docs.append(document)
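The t.me/c/ deep-link form identifies a private channel or supergroup by its internal id, i.e. the API chat id without the "-100" prefix, which is why the transform strips it once before the loop. A minimal sketch with made-up ids:

chat_id = -1001234567890  # hypothetical supergroup chat id as reported by the API
message_id = 42

link_chat_id = str(chat_id).removeprefix("-100")  # -> "1234567890"
url = f"https://t.me/c/{link_chat_id}/{message_id}"
print(url)  # https://t.me/c/1234567890/42
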
@@ -105,7 +105,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 1),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000000",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
@@ -123,7 +123,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000001",
"channelId": channels[1],
"channelName": "channel2",
"threadId": None,
@@ -141,7 +141,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000002",
"channelId": channels[1],
"channelName": "channel2",
"threadId": "88888",
@@ -159,7 +159,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000003",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
@@ -179,7 +179,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000004",
"channelId": "734738382",
"channelName": "channel1",
"threadId": None,
@@ -271,6 +271,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000000",
}

expected_metadata_1 = {
Expand All @@ -284,6 +285,7 @@ def test_transform_two_data(self):
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/22222/10000000001",
}

expected_metadata_2 = {
Expand All @@ -298,6 +300,7 @@ def test_transform_two_data(self):
"replier_global_name": "user4_GlobalName",
"thread": "example_thread1",
"role_mentions": ["role1"],
"url": "https://discord.com/channels/1234/88888/10000000002",
}

expected_metadata_3 = {
Expand All @@ -307,6 +310,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000003",
}
print(documents[0].text)
self.assertDictEqual(documents[0].metadata, expected_metadata_0)
@@ -168,6 +168,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url": f"https://discord.com/channels/{guild_id}/1313130/1111111110",
"thread": None,
}

@@ -180,6 +181,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/1313131/1111111111",
"thread": None,
}

@@ -192,6 +194,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/88888/1111111112",
"thread": "example_thread1",
"role_mentions": ["role1"],
}
@@ -203,6 +206,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url_reference": {"[URL0]": "https://www.google.com"},
"url": f"https://discord.com/channels/{guild_id}/1313133/1111111113",
"thread": None,
}
