Skip to content

Commit

Permalink
Merge pull request #330 from TogetherCrew/feat/329-hivemind-include-url
Browse files Browse the repository at this point in the history
Feat: Hivemind include url for documents
  • Loading branch information
amindadgar authored Nov 28, 2024
2 parents c6db4c1 + 7d2191e commit 8ae632f
Show file tree
Hide file tree
Showing 17 changed files with 273 additions and 38 deletions.
37 changes: 34 additions & 3 deletions dags/hivemind_etl_helpers/notion_etl.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import copy
import logging

from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
from llama_index.core import Document
from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline


Expand Down Expand Up @@ -56,7 +58,8 @@ def process(
documents = self.notion_extractor.extract(
page_ids=page_ids, database_ids=database_ids
)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_page(self, page_id: str) -> None:
"""
Expand All @@ -71,7 +74,8 @@ def process_page(self, page_id: str) -> None:
f"Processing page_id: {page_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_pages(page_ids=[page_id])
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_database(self, database_id: str) -> None:
"""
Expand All @@ -86,4 +90,31 @@ def process_database(self, database_id: str) -> None:
f"Processing database id: {database_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_database(database_id=database_id)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def _transform_documents(self, documents: list[Document]) -> list[Document]:
    """
    Transform Notion-extracted documents by adding a ``url`` key to each
    document's metadata.

    The URL is derived from the document's ``page_id`` metadata entry;
    documents without a ``page_id`` get ``url`` set to ``None``.

    Parameters
    ------------
    documents : list[Document]
        a list of notion extracted pages

    Returns
    ---------
    documents : list[Document]
        deep copies of the inputs, each including a ``url`` in its metadata
    """
    # deep-copy so the caller's document list is left untouched
    transformed_docs: list[Document] = copy.deepcopy(documents)

    for doc in transformed_docs:
        page_id: str | None = doc.metadata.get("page_id")
        # NOTE(review): assumes `page_id` is in a form Notion accepts
        # directly in URLs (it accepts both hyphenated and bare UUIDs) —
        # confirm against the extractor's output.
        if page_id is None:
            doc.metadata["url"] = None
        else:
            doc.metadata["url"] = f"https://www.notion.so/{page_id}"

    return transformed_docs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def prepare_document(
reactions = message["reactions"]
raw_content = message["content"]

message_id = message["messageId"]
channel_id = message["channelId"]
thread_id = message["threadId"]

reaction_ids = prepare_raction_ids(reactions)

mention_names: list[str]
Expand Down Expand Up @@ -161,10 +165,16 @@ def prepare_document(
# always has length 1
assert len(author_name) == 1, "Either None or multiple authors!"

if thread_id is None:
url = f"https://discord.com/channels/{guild_id}/{channel_id}/{message_id}"
else:
url = f"https://discord.com/channels/{guild_id}/{thread_id}/{message_id}"

msg_meta_data = {
"channel": message["channelName"],
"date": message["createdDate"].strftime(DATE_FORMAT),
"author_username": author_name[0],
"url": url,
# always including the thread_name, if `None`, then it was a channel message
"thread": message["threadName"],
}
Expand Down Expand Up @@ -234,6 +244,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
doc.excluded_llm_metadata_keys = [
"author_nickname",
Expand All @@ -250,6 +261,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
else:
doc = Document(text=content_url_updated)
Expand Down
2 changes: 2 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ def fetch_raw_posts(
author.username AS author_username,
author.name AS author_name,
t.title AS topic,
t.id AS topic_id,
p.id AS postId,
$forum_endpoint AS forum_endpoint,
p.postNumber as post_number,
p.raw AS raw,
p.createdAt AS createdAt,
p.updatedAt AS updatedAt,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@ def transform_raw_to_documents(
doc: Document

if not exclude_metadata:
forum_endpoint = post["forum_endpoint"]
topic_id = post["topic_id"]
post_number = post["post_number"]

link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}"

doc = Document(
text=post["raw"],
metadata={
"author_name": post["author_name"],
"author_username": post["author_username"],
"forum_endpoint": post["forum_endpoint"],
"forum_endpoint": forum_endpoint,
"createdAt": post["createdAt"],
"updatedAt": post["updatedAt"],
"postId": post["postId"],
Expand All @@ -49,6 +55,7 @@ def transform_raw_to_documents(
"liker_names": post["liker_names"],
"replier_usernames": post["replier_usernames"],
"replier_names": post["replier_names"],
"link": link,
},
)
else:
Expand Down
21 changes: 20 additions & 1 deletion dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import logging
import os
from typing import List, Optional
Expand Down Expand Up @@ -49,7 +50,9 @@ def load_data(
documents.extend(self._load_from_files(file_ids))
if not documents:
raise ValueError("One input at least must be given!")
return documents

transformed_documents = self._transform_google_documents(documents)
return transformed_documents

def _load_from_folders(self, folder_ids: List[str]):
folders_data = []
Expand Down Expand Up @@ -108,3 +111,19 @@ def _load_google_drive_creds(self) -> tuple[str, str]:
raise ValueError("`GOOGLE_CLIENT_SECRET` not found from env variables!")

return client_id, client_secret

def _transform_google_documents(self, documents: list[Document]) -> list[Document]:
    """
    Transform Google Drive extracted documents by adding a ``url`` key to
    each document's metadata.

    The URL points at the Drive file-viewer page and is built from the
    ``"file id"`` metadata entry; documents missing that key get
    ``url`` set to ``None``.

    Parameters
    ------------
    documents : list[Document]
        documents loaded from google drive

    Returns
    ---------
    documents : list[Document]
        deep copies of the inputs, each including a ``url`` in its metadata
    """
    # deep-copy so the caller's document list is not mutated in place
    transformed_docs: list[Document] = copy.deepcopy(documents)

    for doc in transformed_docs:
        # NOTE(review): the key is "file id" (with a space) — presumably
        # matching what llama-index's GoogleDriveReader emits; verify this
        # key name when upgrading that dependency.
        file_id: str | None = doc.metadata.get("file id")
        if file_id is None:
            doc.metadata["url"] = None
        else:
            doc.metadata["url"] = f"https://drive.google.com/file/d/{file_id}/view"

    return transformed_docs
2 changes: 1 addition & 1 deletion dags/hivemind_etl_helpers/src/db/github/extract/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _fetch_raw_commits(
user_commiter.login AS committer_name,
co.`commit.message` AS message,
co.`commit.url` AS api_url,
co.`parents.0.html_url` AS html_url,
co.`parents.0.html_url` AS url,
co.repository_id AS repository_id,
repo.full_name AS repository_name,
co.sha AS sha,
Expand Down
8 changes: 4 additions & 4 deletions dags/hivemind_etl_helpers/src/db/github/schema/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(
committer_name: str,
message: str,
api_url: str,
html_url: str,
url: str,
repository_id: int,
repository_name: str,
sha: str,
Expand All @@ -26,7 +26,7 @@ def __init__(
self.committer_name = committer_name
self.message = message
self.api_url = api_url
self.html_url = html_url
self.url = url
self.repository_id = repository_id
self.repository_name = repository_name
self.sha = sha
Expand All @@ -42,7 +42,7 @@ def from_dict(cls, data: dict[str, str | int | None]) -> "GitHubCommit":
committer_name=data["committer_name"], # type: ignore
message=data["message"], # type: ignore
api_url=data["api_url"], # type: ignore
html_url=data["html_url"], # type: ignore
url=data["url"], # type: ignore
repository_id=data["repository_id"], # type: ignore
repository_name=data["repository_name"], # type: ignore
sha=data["sha"], # type: ignore
Expand All @@ -58,7 +58,7 @@ def to_dict(self) -> dict[str, str | int | None]:
"committer_name": self.committer_name,
"message": self.message,
"api_url": self.api_url,
"html_url": self.html_url,
"url": self.url,
"repository_id": self.repository_id,
"repository_name": self.repository_name,
"sha": self.sha,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def transform_commits(data: list[GitHubCommit]) -> list[Document]:
metadata=metadata,
# all metadata to be excluded from embedding model
excluded_embed_metadata_keys=list(metadata.keys()),
excluded_llm_metadata_keys=["sha", "api_url", "html_url", "verification"],
excluded_llm_metadata_keys=["sha", "api_url", "url", "verification"],
)
transformed_commits.append(document)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"""
transformed_docs: list[Document] = []

# within links the "-100" of chat_id is removed
chat_id = str(self.chat_id).removeprefix("-100")

for message in messages:
document = Document(
text=message.message_text,
Expand All @@ -35,6 +38,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies": message.repliers,
"reactors": message.reactors,
"chat_name": self.chat_name,
"url": f"https://t.me/c/{chat_id}/{message.message_id}",
},
excluded_embed_metadata_keys=[
"author",
Expand All @@ -44,6 +48,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
excluded_llm_metadata_keys=[
"createdAt",
Expand All @@ -52,6 +57,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
)
transformed_docs.append(document)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 1),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000000",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
Expand All @@ -123,7 +123,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000001",
"channelId": channels[1],
"channelName": "channel2",
"threadId": None,
Expand All @@ -141,7 +141,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000002",
"channelId": channels[1],
"channelName": "channel2",
"threadId": "88888",
Expand All @@ -159,7 +159,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000003",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
Expand All @@ -179,7 +179,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000004",
"channelId": "734738382",
"channelName": "channel1",
"threadId": None,
Expand Down Expand Up @@ -271,6 +271,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000000",
}

expected_metadata_1 = {
Expand All @@ -284,6 +285,7 @@ def test_transform_two_data(self):
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/22222/10000000001",
}

expected_metadata_2 = {
Expand All @@ -298,6 +300,7 @@ def test_transform_two_data(self):
"replier_global_name": "user4_GlobalName",
"thread": "example_thread1",
"role_mentions": ["role1"],
"url": "https://discord.com/channels/1234/88888/10000000002",
}

expected_metadata_3 = {
Expand All @@ -307,6 +310,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000003",
}
print(documents[0].text)
self.assertDictEqual(documents[0].metadata, expected_metadata_0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url": f"https://discord.com/channels/{guild_id}/1313130/1111111110",
"thread": None,
}

Expand All @@ -180,6 +181,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/1313131/1111111111",
"thread": None,
}

Expand All @@ -192,6 +194,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/88888/1111111112",
"thread": "example_thread1",
"role_mentions": ["role1"],
}
Expand All @@ -203,6 +206,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url_reference": {"[URL0]": "https://www.google.com"},
"url": f"https://discord.com/channels/{guild_id}/1313133/1111111113",
"thread": None,
}

Expand Down
Loading

0 comments on commit 8ae632f

Please sign in to comment.