Skip to content

Commit

Permalink
Merge pull request #314 from TogetherCrew/feat/313-telegram-summarizer
Browse files Browse the repository at this point in the history
Telegram Summarizer ETL Added!
  • Loading branch information
amindadgar authored Nov 6, 2024
2 parents a2b108e + cf1067f commit 292f14a
Show file tree
Hide file tree
Showing 9 changed files with 576 additions and 118 deletions.
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .messages import ExtractMessages
from .messages_daily import ExtractMessagesDaily
from .tc_chats import TelegramChats
6 changes: 3 additions & 3 deletions dags/hivemind_etl_helpers/src/db/telegram/extract/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def extract(self, from_date: datetime | None = None) -> list[TelegramMessagesMod
Parameters
-----------
from_date : datetime | None
load from a specific date
if not given, load all data
extract from a specific date
if not given, extract all data
Returns
---------
Expand Down Expand Up @@ -69,7 +69,7 @@ def extract(self, from_date: datetime | None = None) -> list[TelegramMessagesMod
COLLECT(DISTINCT mentioned_user.username) AS mentions,
COLLECT(DISTINCT replied_user.username) AS repliers,
COLLECT(DISTINCT reacted_user.username) AS reactors
ORDER BY message_created_at DESC
ORDER BY message_created_at ASC
"""

parameters = {"chat_id": self.chat_id}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from collections import defaultdict
from datetime import date, datetime

from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel

from . import ExtractMessages


class ExtractMessagesDaily:
def __init__(self, chat_id: str) -> None:
self.extractor = ExtractMessages(chat_id=chat_id)

def extract(
self, from_date: datetime | None = None
) -> dict[date, list[TelegramMessagesModel]]:
"""
extract messages daily
Parameters
-----------
from_date : datetime | None
extract from a specific date
if not given, extract all data
Returns
--------
daily_tg_messages : dict[datetime.date, list[TelegramMessagesModel]]
telegram messages extracted and daily grouped
"""
messages = self.extractor.extract(from_date=from_date)

daily_tg_messages: dict[date, list[TelegramMessagesModel]] = defaultdict(list)
for msg in messages:
msg_date = datetime.fromtimestamp(msg.message_created_at / 1000).date()
daily_tg_messages[msg_date].append(msg)

return daily_tg_messages
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .messages import TransformMessages
from .summarizer import SummarizeMessages
from .summary import TransformSummary
66 changes: 66 additions & 0 deletions dags/hivemind_etl_helpers/src/db/telegram/transform/summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from datetime import date, timedelta

from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel
from hivemind_etl_helpers.src.db.telegram.transform import TransformMessages
from hivemind_etl_helpers.src.utils.summary.summary_base import SummaryBase
from llama_index.core import Settings
from llama_index.core.response_synthesizers.base import BaseSynthesizer


class SummarizeMessages(SummaryBase):
def __init__(
self,
chat_id: str,
chat_name: str,
response_synthesizer: BaseSynthesizer | None = None,
verbose: bool = False,
**kwargs,
) -> None:
llm = kwargs.get("llm", Settings.llm)
super().__init__(llm, response_synthesizer, verbose)

self.message_transformer = TransformMessages(
chat_id=chat_id, chat_name=chat_name
)

def summarize_daily(
self, messages: dict[date, list[TelegramMessagesModel]]
) -> dict[date, str]:
"""
summarize the daily messages
Parameters
-----------
messages : dict[date, list[TelegramMessagesModel]]
daily grouped messages
Returns
---------
summaries : dict[date, str]
the summaries of each group
"""
summaries: dict[date, str] = {}

# per each daily messages
for day, msgs in messages.items():
# if no messages were available
if msgs:
continue

start_date = day.strftime("%d/%m/%Y")
end_date = (day + timedelta(days=1)).strftime("%d/%m/%Y")

day_documents = self.message_transformer.transform(messages=msgs)
summary = self._get_summary(
messages_document=day_documents,
summarization_query=(
"Please make a concise summary based only on the provided "
f"messages from a Telegram group chat from {start_date} to {end_date}."
" Please focus on main topics, decisions, and key information exchanged."
" Organize the output in one or multiple descriptive bullet points."
),
)

summaries[day] = summary

return summaries
40 changes: 40 additions & 0 deletions dags/hivemind_etl_helpers/src/db/telegram/transform/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from datetime import date, timedelta

from llama_index.core import Document


class TransformSummary:
def __init__(self) -> None:
pass

def transform(self, summaries: dict[date, str]) -> list[Document]:
"""
transform daily summaries to llama-index documents
Parameters
-----------
summaries : dict[date, str]
daily summaries
Returns
--------
summary_docs : list[llama_index.core.Document]
llama-index documents for summaries
"""
summary_docs: list[Document] = []

for day, summary in summaries.items():
# assigning an id so it would be consistent across different runs
document = Document(
doc_id="summary_"
+ day.strftime("%Y-%m-%d")
+ "_"
+ (day + timedelta(days=1)).strftime("%Y-%m-%d"),
text=summary,
metadata={
"date": day,
},
)
summary_docs.append(document)

return summary_docs
155 changes: 155 additions & 0 deletions dags/hivemind_etl_helpers/tests/unit/test_telegram_daily_summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import unittest
from datetime import date

from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel
from hivemind_etl_helpers.src.db.telegram.transform import SummarizeMessages
from llama_index.core import Document, MockEmbedding, Settings
from llama_index.core.llms import MockLLM


class TestTelegramSummarize(unittest.TestCase):
def setUp(self):
Settings.llm = MockLLM()
Settings.chunk_size = 256
Settings.embed_model = MockEmbedding(embed_dim=1024)

self.summarizer = SummarizeMessages(chat_id="temp_id", chat_name="temp_chat")

def test_summarize_empty_messages(self):
summaries = self.summarizer.summarize_daily(messages={})
self.assertEqual(summaries, {})

def test_summarize_single_day_single_document(self):
summaries = self.summarizer.summarize_daily(
messages={
date(2024, 6, 6): [
TelegramMessagesModel(
message_id=1,
message_text="message 1",
author_username="username1",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
],
}
)

for key, summary in summaries.items():
self.assertEqual(key, date(2024, 6, 6))
self.assertIsInstance(summary, str)

def test_summarize_single_day_multiple_documents(self):
summaries = self.summarizer.summarize_daily(
messages={
date(2024, 6, 6): [
TelegramMessagesModel(
message_id=1,
message_text="message 1",
author_username="username1",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
TelegramMessagesModel(
message_id=2,
message_text="message 2",
author_username="username2",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
],
}
)
for key, summary in summaries.items():
self.assertEqual(key, date(2024, 6, 6))
self.assertIsInstance(summary, str)

def test_summarize_multiple_days_multiple_document(self):
summaries = self.summarizer.summarize_daily(
messages={
date(2024, 6, 6): [
TelegramMessagesModel(
message_id=1,
message_text="message 1",
author_username="username1",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
TelegramMessagesModel(
message_id=2,
message_text="message 2",
author_username="username2",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
],
date(2024, 6, 7): [
TelegramMessagesModel(
message_id=1,
message_text="message 1",
author_username="username1",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
TelegramMessagesModel(
message_id=2,
message_text="message 2",
author_username="username2",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
],
date(2024, 6, 8): [
TelegramMessagesModel(
message_id=1,
message_text="message 1",
author_username="username1",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
TelegramMessagesModel(
message_id=2,
message_text="message 2",
author_username="username2",
message_created_at=1730801345876,
message_edited_at=1730801345877,
mentions=[],
reactors=[],
repliers=[],
),
],
}
)
for key, summary in summaries.items():
self.assertIn(
key,
[
date(2024, 6, 6),
date(2024, 6, 7),
date(2024, 6, 8),
],
)
self.assertIsInstance(summary, str)
Loading

0 comments on commit 292f14a

Please sign in to comment.