-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #314 from TogetherCrew/feat/313-telegram-summarizer
Telegram Summarizer ETL Added!
- Loading branch information
Showing
9 changed files
with
576 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .messages import ExtractMessages | ||
from .messages_daily import ExtractMessagesDaily | ||
from .tc_chats import TelegramChats |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 37 additions & 0 deletions
37
dags/hivemind_etl_helpers/src/db/telegram/extract/messages_daily.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from collections import defaultdict | ||
from datetime import date, datetime | ||
|
||
from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel | ||
|
||
from . import ExtractMessages | ||
|
||
|
||
class ExtractMessagesDaily: | ||
def __init__(self, chat_id: str) -> None: | ||
self.extractor = ExtractMessages(chat_id=chat_id) | ||
|
||
def extract( | ||
self, from_date: datetime | None = None | ||
) -> dict[date, list[TelegramMessagesModel]]: | ||
""" | ||
extract messages daily | ||
Parameters | ||
----------- | ||
from_date : datetime | None | ||
extract from a specific date | ||
if not given, extract all data | ||
Returns | ||
-------- | ||
daily_tg_messages : dict[datetime.date, list[TelegramMessagesModel]] | ||
telegram messages extracted and daily grouped | ||
""" | ||
messages = self.extractor.extract(from_date=from_date) | ||
|
||
daily_tg_messages: dict[date, list[TelegramMessagesModel]] = defaultdict(list) | ||
for msg in messages: | ||
msg_date = datetime.fromtimestamp(msg.message_created_at / 1000).date() | ||
daily_tg_messages[msg_date].append(msg) | ||
|
||
return daily_tg_messages |
2 changes: 2 additions & 0 deletions
2
dags/hivemind_etl_helpers/src/db/telegram/transform/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
from .messages import TransformMessages | ||
from .summarizer import SummarizeMessages | ||
from .summary import TransformSummary |
66 changes: 66 additions & 0 deletions
66
dags/hivemind_etl_helpers/src/db/telegram/transform/summarizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from datetime import date, timedelta | ||
|
||
from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel | ||
from hivemind_etl_helpers.src.db.telegram.transform import TransformMessages | ||
from hivemind_etl_helpers.src.utils.summary.summary_base import SummaryBase | ||
from llama_index.core import Settings | ||
from llama_index.core.response_synthesizers.base import BaseSynthesizer | ||
|
||
|
||
class SummarizeMessages(SummaryBase): | ||
def __init__( | ||
self, | ||
chat_id: str, | ||
chat_name: str, | ||
response_synthesizer: BaseSynthesizer | None = None, | ||
verbose: bool = False, | ||
**kwargs, | ||
) -> None: | ||
llm = kwargs.get("llm", Settings.llm) | ||
super().__init__(llm, response_synthesizer, verbose) | ||
|
||
self.message_transformer = TransformMessages( | ||
chat_id=chat_id, chat_name=chat_name | ||
) | ||
|
||
def summarize_daily( | ||
self, messages: dict[date, list[TelegramMessagesModel]] | ||
) -> dict[date, str]: | ||
""" | ||
summarize the daily messages | ||
Parameters | ||
----------- | ||
messages : dict[date, list[TelegramMessagesModel]] | ||
daily grouped messages | ||
Returns | ||
--------- | ||
summaries : dict[date, str] | ||
the summaries of each group | ||
""" | ||
summaries: dict[date, str] = {} | ||
|
||
# per each daily messages | ||
for day, msgs in messages.items(): | ||
# if no messages were available | ||
if msgs: | ||
continue | ||
|
||
start_date = day.strftime("%d/%m/%Y") | ||
end_date = (day + timedelta(days=1)).strftime("%d/%m/%Y") | ||
|
||
day_documents = self.message_transformer.transform(messages=msgs) | ||
summary = self._get_summary( | ||
messages_document=day_documents, | ||
summarization_query=( | ||
"Please make a concise summary based only on the provided " | ||
f"messages from a Telegram group chat from {start_date} to {end_date}." | ||
" Please focus on main topics, decisions, and key information exchanged." | ||
" Organize the output in one or multiple descriptive bullet points." | ||
), | ||
) | ||
|
||
summaries[day] = summary | ||
|
||
return summaries |
40 changes: 40 additions & 0 deletions
40
dags/hivemind_etl_helpers/src/db/telegram/transform/summary.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from datetime import date, timedelta | ||
|
||
from llama_index.core import Document | ||
|
||
|
||
class TransformSummary: | ||
def __init__(self) -> None: | ||
pass | ||
|
||
def transform(self, summaries: dict[date, str]) -> list[Document]: | ||
""" | ||
transform daily summaries to llama-index documents | ||
Parameters | ||
----------- | ||
summaries : dict[date, str] | ||
daily summaries | ||
Returns | ||
-------- | ||
summary_docs : list[llama_index.core.Document] | ||
llama-index documents for summaries | ||
""" | ||
summary_docs: list[Document] = [] | ||
|
||
for day, summary in summaries.items(): | ||
# assigning an id so it would be consistent across different runs | ||
document = Document( | ||
doc_id="summary_" | ||
+ day.strftime("%Y-%m-%d") | ||
+ "_" | ||
+ (day + timedelta(days=1)).strftime("%Y-%m-%d"), | ||
text=summary, | ||
metadata={ | ||
"date": day, | ||
}, | ||
) | ||
summary_docs.append(document) | ||
|
||
return summary_docs |
155 changes: 155 additions & 0 deletions
155
dags/hivemind_etl_helpers/tests/unit/test_telegram_daily_summarize.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import unittest | ||
from datetime import date | ||
|
||
from hivemind_etl_helpers.src.db.telegram.schema import TelegramMessagesModel | ||
from hivemind_etl_helpers.src.db.telegram.transform import SummarizeMessages | ||
from llama_index.core import Document, MockEmbedding, Settings | ||
from llama_index.core.llms import MockLLM | ||
|
||
|
||
class TestTelegramSummarize(unittest.TestCase): | ||
def setUp(self): | ||
Settings.llm = MockLLM() | ||
Settings.chunk_size = 256 | ||
Settings.embed_model = MockEmbedding(embed_dim=1024) | ||
|
||
self.summarizer = SummarizeMessages(chat_id="temp_id", chat_name="temp_chat") | ||
|
||
def test_summarize_empty_messages(self): | ||
summaries = self.summarizer.summarize_daily(messages={}) | ||
self.assertEqual(summaries, {}) | ||
|
||
def test_summarize_single_day_single_document(self): | ||
summaries = self.summarizer.summarize_daily( | ||
messages={ | ||
date(2024, 6, 6): [ | ||
TelegramMessagesModel( | ||
message_id=1, | ||
message_text="message 1", | ||
author_username="username1", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
], | ||
} | ||
) | ||
|
||
for key, summary in summaries.items(): | ||
self.assertEqual(key, date(2024, 6, 6)) | ||
self.assertIsInstance(summary, str) | ||
|
||
def test_summarize_single_day_multiple_documents(self): | ||
summaries = self.summarizer.summarize_daily( | ||
messages={ | ||
date(2024, 6, 6): [ | ||
TelegramMessagesModel( | ||
message_id=1, | ||
message_text="message 1", | ||
author_username="username1", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
TelegramMessagesModel( | ||
message_id=2, | ||
message_text="message 2", | ||
author_username="username2", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
], | ||
} | ||
) | ||
for key, summary in summaries.items(): | ||
self.assertEqual(key, date(2024, 6, 6)) | ||
self.assertIsInstance(summary, str) | ||
|
||
def test_summarize_multiple_days_multiple_document(self): | ||
summaries = self.summarizer.summarize_daily( | ||
messages={ | ||
date(2024, 6, 6): [ | ||
TelegramMessagesModel( | ||
message_id=1, | ||
message_text="message 1", | ||
author_username="username1", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
TelegramMessagesModel( | ||
message_id=2, | ||
message_text="message 2", | ||
author_username="username2", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
], | ||
date(2024, 6, 7): [ | ||
TelegramMessagesModel( | ||
message_id=1, | ||
message_text="message 1", | ||
author_username="username1", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
TelegramMessagesModel( | ||
message_id=2, | ||
message_text="message 2", | ||
author_username="username2", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
], | ||
date(2024, 6, 8): [ | ||
TelegramMessagesModel( | ||
message_id=1, | ||
message_text="message 1", | ||
author_username="username1", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
TelegramMessagesModel( | ||
message_id=2, | ||
message_text="message 2", | ||
author_username="username2", | ||
message_created_at=1730801345876, | ||
message_edited_at=1730801345877, | ||
mentions=[], | ||
reactors=[], | ||
repliers=[], | ||
), | ||
], | ||
} | ||
) | ||
for key, summary in summaries.items(): | ||
self.assertIn( | ||
key, | ||
[ | ||
date(2024, 6, 6), | ||
date(2024, 6, 7), | ||
date(2024, 6, 8), | ||
], | ||
) | ||
self.assertIsInstance(summary, str) |
Oops, something went wrong.