Skip to content

Commit

Permalink
Merge pull request #330 from TogetherCrew/feat/329-hivemind-include-url
Browse files Browse the repository at this point in the history
Feat: Hivemind include url for documents
  • Loading branch information
amindadgar authored Nov 28, 2024
2 parents c6db4c1 + 7d2191e commit 8ae632f
Show file tree
Hide file tree
Showing 17 changed files with 273 additions and 38 deletions.
37 changes: 34 additions & 3 deletions dags/hivemind_etl_helpers/notion_etl.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import copy
import logging

from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
from llama_index.core import Document
from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline


Expand Down Expand Up @@ -56,7 +58,8 @@ def process(
documents = self.notion_extractor.extract(
page_ids=page_ids, database_ids=database_ids
)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_page(self, page_id: str) -> None:
"""
Expand All @@ -71,7 +74,8 @@ def process_page(self, page_id: str) -> None:
f"Processing page_id: {page_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_pages(page_ids=[page_id])
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def process_database(self, database_id: str) -> None:
"""
Expand All @@ -86,4 +90,31 @@ def process_database(self, database_id: str) -> None:
f"Processing database id: {database_id}, of community id: {self.community_id}"
)
documents = self.notion_extractor.extract_from_database(database_id=database_id)
self.ingestion_pipeline.run_pipeline(docs=documents)
transformed_docs = self._transform_documents(documents=documents)
self.ingestion_pipeline.run_pipeline(docs=transformed_docs)

def _transform_documents(self, documents: list[Document]) -> list[Document]:
    """
    Transform Notion-extracted documents by adding a ``url`` key to each
    document's metadata.

    The URL is derived from the document's ``page_id`` metadata entry;
    documents without a ``page_id`` get ``url`` set to ``None``.

    Parameters
    ------------
    documents : list[Document]
        a list of notion extracted pages

    Returns
    ---------
    documents : list[Document]
        deep copies of the inputs, each including a ``url`` in its metadata
    """
    # deep-copy so the caller's document list is left untouched
    transformed_docs: list[Document] = copy.deepcopy(documents)

    for doc in transformed_docs:
        page_id: str | None = doc.metadata.get("page_id")
        # NOTE(review): assumes `page_id` is in a form Notion accepts
        # directly in URLs (it accepts both hyphenated and bare UUIDs) —
        # confirm against the extractor's output.
        if page_id is None:
            doc.metadata["url"] = None
        else:
            doc.metadata["url"] = f"https://www.notion.so/{page_id}"

    return transformed_docs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def prepare_document(
reactions = message["reactions"]
raw_content = message["content"]

message_id = message["messageId"]
channel_id = message["channelId"]
thread_id = message["threadId"]

reaction_ids = prepare_raction_ids(reactions)

mention_names: list[str]
Expand Down Expand Up @@ -161,10 +165,16 @@ def prepare_document(
# always has length 1
assert len(author_name) == 1, "Either None or multiple authors!"

if thread_id is None:
url = f"https://discord.com/channels/{guild_id}/{channel_id}/{message_id}"
else:
url = f"https://discord.com/channels/{guild_id}/{thread_id}/{message_id}"

msg_meta_data = {
"channel": message["channelName"],
"date": message["createdDate"].strftime(DATE_FORMAT),
"author_username": author_name[0],
"url": url,
# always including the thread_name, if `None`, then it was a channel message
"thread": message["threadName"],
}
Expand Down Expand Up @@ -234,6 +244,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
doc.excluded_llm_metadata_keys = [
"author_nickname",
Expand All @@ -250,6 +261,7 @@ def prepare_document(
"replier_global_name",
"replier_nickname",
"role_mentions",
"url",
]
else:
doc = Document(text=content_url_updated)
Expand Down
2 changes: 2 additions & 0 deletions dags/hivemind_etl_helpers/src/db/discourse/fetch_raw_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ def fetch_raw_posts(
author.username AS author_username,
author.name AS author_name,
t.title AS topic,
t.id AS topic_id,
p.id AS postId,
$forum_endpoint AS forum_endpoint,
p.postNumber as post_number,
p.raw AS raw,
p.createdAt AS createdAt,
p.updatedAt AS updatedAt,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@ def transform_raw_to_documents(
doc: Document

if not exclude_metadata:
forum_endpoint = post["forum_endpoint"]
topic_id = post["topic_id"]
post_number = post["post_number"]

link = f"https://{forum_endpoint}/t/{topic_id}/{post_number}"

doc = Document(
text=post["raw"],
metadata={
"author_name": post["author_name"],
"author_username": post["author_username"],
"forum_endpoint": post["forum_endpoint"],
"forum_endpoint": forum_endpoint,
"createdAt": post["createdAt"],
"updatedAt": post["updatedAt"],
"postId": post["postId"],
Expand All @@ -49,6 +55,7 @@ def transform_raw_to_documents(
"liker_names": post["liker_names"],
"replier_usernames": post["replier_usernames"],
"replier_names": post["replier_names"],
"link": link,
},
)
else:
Expand Down
21 changes: 20 additions & 1 deletion dags/hivemind_etl_helpers/src/db/gdrive/gdrive_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import logging
import os
from typing import List, Optional
Expand Down Expand Up @@ -49,7 +50,9 @@ def load_data(
documents.extend(self._load_from_files(file_ids))
if not documents:
raise ValueError("One input at least must be given!")
return documents

transformed_documents = self._transform_google_documents(documents)
return transformed_documents

def _load_from_folders(self, folder_ids: List[str]):
folders_data = []
Expand Down Expand Up @@ -108,3 +111,19 @@ def _load_google_drive_creds(self) -> tuple[str, str]:
raise ValueError("`GOOGLE_CLIENT_SECRET` not found from env variables!")

return client_id, client_secret

def _transform_google_documents(self, documents: list[Document]) -> list[Document]:
    """
    Transform Google Drive extracted documents by adding a ``url`` key to
    each document's metadata.

    The URL points at the Drive file-viewer page and is built from the
    ``"file id"`` metadata entry; documents missing that key get
    ``url`` set to ``None``.

    Parameters
    ------------
    documents : list[Document]
        documents loaded from google drive

    Returns
    ---------
    documents : list[Document]
        deep copies of the inputs, each including a ``url`` in its metadata
    """
    # deep-copy so the caller's document list is not mutated in place
    transformed_docs: list[Document] = copy.deepcopy(documents)

    for doc in transformed_docs:
        # NOTE(review): the key is "file id" (with a space) — presumably
        # matching what llama-index's GoogleDriveReader emits; verify this
        # key name when upgrading that dependency.
        file_id: str | None = doc.metadata.get("file id")
        if file_id is None:
            doc.metadata["url"] = None
        else:
            doc.metadata["url"] = f"https://drive.google.com/file/d/{file_id}/view"

    return transformed_docs
2 changes: 1 addition & 1 deletion dags/hivemind_etl_helpers/src/db/github/extract/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _fetch_raw_commits(
user_commiter.login AS committer_name,
co.`commit.message` AS message,
co.`commit.url` AS api_url,
co.`parents.0.html_url` AS html_url,
co.`parents.0.html_url` AS url,
co.repository_id AS repository_id,
repo.full_name AS repository_name,
co.sha AS sha,
Expand Down
8 changes: 4 additions & 4 deletions dags/hivemind_etl_helpers/src/db/github/schema/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(
committer_name: str,
message: str,
api_url: str,
html_url: str,
url: str,
repository_id: int,
repository_name: str,
sha: str,
Expand All @@ -26,7 +26,7 @@ def __init__(
self.committer_name = committer_name
self.message = message
self.api_url = api_url
self.html_url = html_url
self.url = url
self.repository_id = repository_id
self.repository_name = repository_name
self.sha = sha
Expand All @@ -42,7 +42,7 @@ def from_dict(cls, data: dict[str, str | int | None]) -> "GitHubCommit":
committer_name=data["committer_name"], # type: ignore
message=data["message"], # type: ignore
api_url=data["api_url"], # type: ignore
html_url=data["html_url"], # type: ignore
url=data["url"], # type: ignore
repository_id=data["repository_id"], # type: ignore
repository_name=data["repository_name"], # type: ignore
sha=data["sha"], # type: ignore
Expand All @@ -58,7 +58,7 @@ def to_dict(self) -> dict[str, str | int | None]:
"committer_name": self.committer_name,
"message": self.message,
"api_url": self.api_url,
"html_url": self.html_url,
"url": self.url,
"repository_id": self.repository_id,
"repository_name": self.repository_name,
"sha": self.sha,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def transform_commits(data: list[GitHubCommit]) -> list[Document]:
metadata=metadata,
# all metadata to be excluded from embedding model
excluded_embed_metadata_keys=list(metadata.keys()),
excluded_llm_metadata_keys=["sha", "api_url", "html_url", "verification"],
excluded_llm_metadata_keys=["sha", "api_url", "url", "verification"],
)
transformed_commits.append(document)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"""
transformed_docs: list[Document] = []

# within links the "-100" of chat_id is removed
chat_id = str(self.chat_id).removeprefix("-100")

for message in messages:
document = Document(
text=message.message_text,
Expand All @@ -35,6 +38,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies": message.repliers,
"reactors": message.reactors,
"chat_name": self.chat_name,
"url": f"https://t.me/c/{chat_id}/{message.message_id}",
},
excluded_embed_metadata_keys=[
"author",
Expand All @@ -44,6 +48,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
excluded_llm_metadata_keys=[
"createdAt",
Expand All @@ -52,6 +57,7 @@ def transform(self, messages: list[TelegramMessagesModel]) -> list[Document]:
"replies",
"reactors",
"chat_name",
"url",
],
)
transformed_docs.append(document)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 1),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000000",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
Expand All @@ -123,7 +123,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000001",
"channelId": channels[1],
"channelName": "channel2",
"threadId": None,
Expand All @@ -141,7 +141,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": "114",
"createdDate": datetime(2023, 5, 2),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000002",
"channelId": channels[1],
"channelName": "channel2",
"threadId": "88888",
Expand All @@ -159,7 +159,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000003",
"channelId": channels[0],
"channelName": "channel1",
"threadId": None,
Expand All @@ -179,7 +179,7 @@ def test_transform_two_data(self):
"reactions": [],
"replied_user": None,
"createdDate": datetime(2023, 5, 8),
"messageId": str(np.random.randint(1000000, 9999999)),
"messageId": "10000000004",
"channelId": "734738382",
"channelName": "channel1",
"threadId": None,
Expand Down Expand Up @@ -271,6 +271,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000000",
}

expected_metadata_1 = {
Expand All @@ -284,6 +285,7 @@ def test_transform_two_data(self):
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"thread": None,
"url": "https://discord.com/channels/1234/22222/10000000001",
}

expected_metadata_2 = {
Expand All @@ -298,6 +300,7 @@ def test_transform_two_data(self):
"replier_global_name": "user4_GlobalName",
"thread": "example_thread1",
"role_mentions": ["role1"],
"url": "https://discord.com/channels/1234/88888/10000000002",
}

expected_metadata_3 = {
Expand All @@ -307,6 +310,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"url_reference": {"[URL0]": "https://www.google.com"},
"thread": None,
"url": "https://discord.com/channels/1234/111111/10000000003",
}
print(documents[0].text)
self.assertDictEqual(documents[0].metadata, expected_metadata_0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def test_transform_two_data(self):
"author_username": "user1",
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url": f"https://discord.com/channels/{guild_id}/1313130/1111111110",
"thread": None,
}

Expand All @@ -180,6 +181,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/1313131/1111111111",
"thread": None,
}

Expand All @@ -192,6 +194,7 @@ def test_transform_two_data(self):
"mention_global_names": ["user3_GlobalName", "user4_GlobalName"],
"replier_username": "user4",
"replier_global_name": "user4_GlobalName",
"url": f"https://discord.com/channels/{guild_id}/88888/1111111112",
"thread": "example_thread1",
"role_mentions": ["role1"],
}
Expand All @@ -203,6 +206,7 @@ def test_transform_two_data(self):
"author_global_name": "user1_GlobalName",
"author_nickname": "user1_nickname",
"url_reference": {"[URL0]": "https://www.google.com"},
"url": f"https://discord.com/channels/{guild_id}/1313133/1111111113",
"thread": None,
}

Expand Down
Loading

0 comments on commit 8ae632f

Please sign in to comment.