Skip to content

Commit

Permalink
Merge pull request #297 from TogetherCrew/feat/296-discourse-forum-en…
Browse files Browse the repository at this point in the history
…dpoint-update

feat: using endpoint for all discourse posts!
  • Loading branch information
amindadgar authored Oct 8, 2024
2 parents 27d751b + 79aa354 commit c4586cf
Show file tree
Hide file tree
Showing 16 changed files with 124 additions and 312 deletions.
8 changes: 2 additions & 6 deletions dags/analyzer_helper/discourse/extract_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def fetch_post_details(
operator = ">" if comparison == "gt" else ">="
where_clause = f"WHERE post.createdAt {operator} $createdAt"
query = f"""
MATCH (forum:DiscourseForum {{endpoint: $forum_endpoint}})
WITH forum
MATCH (post:DiscoursePost {{forumUuid: forum.uuid}})
MATCH (post:DiscoursePost {{endpoint: $forum_endpoint}})
{where_clause if where_clause else ""}
{"AND " if where_clause else "WHERE "}
post.createdAt is NOT NULL
Expand Down Expand Up @@ -117,9 +115,7 @@ def get_latest_post_created_at(self, forum_endpoint: str) -> Optional[str]:
Optional[str]: The created_at timestamp of the latest post, or None if no posts are found.
"""
query = """
MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
WITH forum
(post:DiscoursePost {forumUuid: forum.uuid})
(post:DiscoursePost {endpoint: $forum_endpoint})
RETURN post.createdAt AS created_at
ORDER BY post.createdAt DESC
LIMIT 1
Expand Down
3 changes: 1 addition & 2 deletions dags/analyzer_helper/discourse/extract_raw_members.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ def fetch_member_details(self, start_date: datetime = None):
:return: List of dictionaries containing member details.
"""
query = """
MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
MATCH (user:DiscourseUser)-[:HAS_JOINED]->(forum)
MATCH (user:DiscourseUser {endpoint: $forum_endpoint})
WHERE user.username IS NOT NULL
"""

Expand Down
4 changes: 1 addition & 3 deletions dags/analyzer_helper/discourse/fetch_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ def fetch_all(self) -> list[float]:
a list of category ids
"""
query = """
MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
WITH forum.uuid as forumUuid
MATCH (c:DiscourseCategory {forumUuid: forumUuid})
MATCH (c:DiscourseCategory {endpoint: $forum_endpoint})
RETURN c.id as category_ids
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,15 @@ def setUpClass(cls):
with cls.driver.session() as session:
session.run(
"""
CREATE (f:DiscourseForum {endpoint: $endpoint, uuid: 'forum-uuid'}),
(u1:DiscourseUser {id: 'user1', name: 'User One'}),
CREATE (u1:DiscourseUser {id: 'user1', name: 'User One'}),
(u2:DiscourseUser {id: 'user2', name: 'User Two'}),
(p1:DiscoursePost
{
id: '1',
content: 'Post 1',
createdAt: '2023-01-01T00:00:00Z',
topicId: 'topic-uuid',
forumUuid: 'forum-uuid',
endpoint: 'http://test_forum',
raw: "Sample Text 1",
postNumber: 1.0
}
Expand All @@ -44,12 +43,12 @@ def setUpClass(cls):
content: 'Post 2',
createdAt: '2023-01-02T00:00:00Z',
topicId: 'topic-uuid',
forumUuid: 'forum-uuid',
endpoint: 'http://test_forum',
raw: "Sample Text 2",
postNumber: 2.0
}
),
(t:DiscourseTopic {id: 'topic-uuid', forumUuid: 'forum-uuid'}),
(t:DiscourseTopic {id: 'topic-uuid', endpoint: 'http://test_forum'}),
(c:DiscourseCategory {id: 'category1', name: 'Category 1'}),
(p1)<-[:HAS_POST]-(t),
(p2)<-[:HAS_POST]-(t),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,28 +34,19 @@ def setUp(self):
)

with self.driver.session() as session:
result_forum = session.run(
"""
CREATE (f:DiscourseForum {endpoint: $forum_endpoint})
RETURN id(f) AS id
""",
forum_endpoint=self.test_forum_endpoint,
)
self.forum_id = result_forum.single()["id"]
# Create user1 and relate to forum
result1 = session.run(
"""
MATCH (f:DiscourseForum {endpoint: $forum_endpoint})
CREATE (
u:DiscourseUser {
id: 'user1',
avatarTemplate: 'avatar1',
createdAt: '2023-07-01',
name: 'user1name',
username: 'username1'
username: 'username1',
endpoint: $forum_endpoint
}
)
-[:HAS_JOINED]->(f)
CREATE (u)-[:HAS_BADGE]->(:Badge {id: 'badge1'})
RETURN id(u) AS id
""",
Expand All @@ -65,16 +56,15 @@ def setUp(self):
# Create user2 and relate to forum
result2 = session.run(
"""
MATCH (f:DiscourseForum {endpoint: $forum_endpoint})
CREATE (u:DiscourseUser {
id: 'user2',
avatarTemplate: 'avatar2',
createdAt: '2023-07-02',
name: 'user2name',
username: 'username2'
username: 'username2',
endpoint: $forum_endpoint
}
)
-[:HAS_JOINED]->(f)
CREATE (u)-[:HAS_BADGE]->(:Badge {id: 'badge2'})
RETURN id(u) AS id
""",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,6 @@ def setUp(self):
self.driver = neo4jConnection.connect_neo4j()

self.endpoint = "endpoint.com"
self.forum_uuid = 123
with self.driver.session() as session:
session.run(
"CREATE (forum:DiscourseForum {uuid: $f_uuid, endpoint: $forum_endpoint})",
{"forum_endpoint": self.endpoint, "f_uuid": self.forum_uuid},
)
self.fetcher = FetchDiscourseCategories(self.endpoint)

def tearDown(self):
Expand All @@ -34,15 +28,15 @@ def test_single_category_available(self):
"""
CREATE (c:DiscourseCategory
{
forumUuid: $forum_uuid,
endpoint: $forum_endpoint,
color: "0088CC",
name: "test category",
descriptionText: "category description",
id: 1.0
}
)
""",
{"forum_uuid": self.forum_uuid},
{"forum_endpoint": self.endpoint},
)

category_ids = self.fetcher.fetch_all()
Expand All @@ -56,7 +50,7 @@ def test_multiple_categories_available(self):
"""
CREATE (:DiscourseCategory
{
forumUuid: $forum_uuid,
endpoint: $forum_endpoint,
color: "0088CC",
name: "test category",
descriptionText: "category description",
Expand All @@ -65,7 +59,7 @@ def test_multiple_categories_available(self):
)
CREATE (:DiscourseCategory
{
forumUuid: $forum_uuid,
endpoint: $forum_endpoint,
color: "0088CC",
name: "test category 2",
descriptionText: "category description 2",
Expand All @@ -74,15 +68,15 @@ def test_multiple_categories_available(self):
)
CREATE (:DiscourseCategory
{
forumUuid: $forum_uuid,
endpoint: $forum_endpoint,
color: "0088CC",
name: "test category 3",
descriptionText: "category description 3",
id: 3.0
}
)
""",
{"forum_uuid": self.forum_uuid},
{"forum_endpoint": self.endpoint},
)

category_ids = self.fetcher.fetch_all()
Expand Down
42 changes: 13 additions & 29 deletions dags/hivemind_etl_helpers/discourse_summary_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import (
DiscourseSummary,
)
from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily
from llama_index.core import Document, Settings
Expand Down Expand Up @@ -40,32 +39,20 @@ def process_discourse_summary(
prefix = f"COMMUNITYID: {community_id} "
logging.info(prefix + "Processing summaries")

forum_uuid = get_forum_uuid(forum_endpoint=forum_endpoint)

# The below commented lines are for debugging
# forum_uuid = [
# {
# "uuid": "851d8069-fc3a-415a-b684-1261d4404092",
# }
# ]
forum_id = forum_uuid[0]["uuid"]
forum_endpoint = forum_endpoint
process_forum(
forum_id=forum_id,
forum_endpoint=forum_endpoint,
community_id=community_id,
dbname=dbname,
log_prefix=f"{prefix}ForumId: {forum_id}",
forum_endpoint=forum_endpoint,
log_prefix=f"{prefix}Forum endpoint: {forum_endpoint}",
from_starting_date=from_starting_date,
)


def process_forum(
forum_id: str,
forum_endpoint: str,
community_id: str,
dbname: str,
log_prefix: str,
forum_endpoint: str,
from_starting_date: datetime,
):
"""
Expand All @@ -74,16 +61,14 @@ def process_forum(
Parameters
------------
forum_id : str
forum_endpoint : str
the forum that the community has
community_id : str
the community that the forum relates to
dbname : str
the data of the community saved within the postgres database `dbname`
log_predix : str
the logging prefix to print out
forum_endpoint : str
the DiscourseForum endpoint for document checking
from_starting_date : datetime
the time to start processing documents
"""
Expand Down Expand Up @@ -119,17 +104,18 @@ def process_forum(
f"{log_prefix} Fetching raw data and converting to llama_index.Documents"
)

raw_data_grouped = fetch_raw_posts_grouped(forum_id=forum_id, from_date=from_date)
raw_data_grouped = fetch_raw_posts_grouped(
forum_endpoint=forum_endpoint, from_date=from_date
)

if raw_data_grouped != []:
(
topic_summary_documents,
category_summary_documenets,
daily_summary_documents,
) = get_summary_documents(
forum_id=forum_id,
raw_data_grouped=raw_data_grouped,
forum_endpoint=forum_endpoint,
raw_data_grouped=raw_data_grouped,
)

node_parser = configure_node_parser(chunk_size=chunk_size)
Expand Down Expand Up @@ -164,19 +150,18 @@ def process_forum(


def get_summary_documents(
forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str
forum_endpoint: str,
raw_data_grouped: list[Record],
) -> tuple[list[Document], list[Document], list[Document],]:
"""
prepare the summary documents for discourse based on given raw data
Parameters
------------
forum_id : str
the forum uuid just for logging
forum_endpoint : str
the discourse forum endpoint to use its data
raw_data_grouped : list[Record]
a list of neo4j records
forum_endpoint : str
the endpoint of the forum id
Returns
--------
Expand All @@ -189,9 +174,8 @@ def get_summary_documents(
"""

discourse_summary = DiscourseSummary(
forum_id=forum_id,
response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
forum_endpoint=forum_endpoint,
response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
)

summarization_query = (
Expand Down
29 changes: 8 additions & 21 deletions dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from hivemind_etl_helpers.src.db.discourse.raw_post_to_documents import (
fetch_discourse_documents,
)
from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
from hivemind_etl_helpers.src.utils.check_documents import check_documents
from llama_index.core import Settings
Expand Down Expand Up @@ -34,32 +33,20 @@ def process_discourse_vectorstore(
prefix = f"COMMUNITYID: {community_id} "
logging.info(prefix)

forum_uuid = get_forum_uuid(forum_endpoint=forum_endpoint)

# The below commented lines are for debugging
# forum_uuid = [
# {
# "uuid": "851d8069-fc3a-415a-b684-1261d4404092",
# }
# ]
forum_id = forum_uuid[0]["uuid"]
forum_endpoint = forum_endpoint
process_forum(
forum_id=forum_id,
forum_endpoint=forum_endpoint,
community_id=community_id,
dbname=dbname,
log_prefix=f"{prefix}ForumId: {forum_id}",
forum_endpoint=forum_endpoint,
log_prefix=f"{prefix}ForumId: {forum_endpoint}",
from_starting_date=from_starting_date,
)


def process_forum(
forum_id: str,
forum_endpoint: str,
community_id: str,
dbname: str,
log_prefix: str,
forum_endpoint: str,
from_starting_date: datetime,
):
"""
Expand All @@ -69,16 +56,14 @@ def process_forum(
Parameters
------------
forum_id : str
the forum that the community has
forum_endpoint : str
the DiscourseForum endpoint for document checking
community_id : str
the community that the forum relates to
dbname : str
the data of the community saved within the postgres database `dbname`
log_predix : str
the logging prefix to print out
forum_endpoint : str
the DiscourseForum endpoint for document checking
from_starting_date : datetime
the time to start processing documents
"""
Expand Down Expand Up @@ -106,7 +91,9 @@ def process_forum(
else:
from_date = from_last_saved_date

documents = fetch_discourse_documents(forum_id=forum_id, from_date=from_date)
documents = fetch_discourse_documents(
forum_endpoint=forum_endpoint, from_date=from_date
)

node_parser = configure_node_parser(chunk_size=chunk_size)
pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)
Expand Down
Loading

0 comments on commit c4586cf

Please sign in to comment.