Merge pull request #297 from TogetherCrew/feat/296-discourse-forum-en…

…dpoint-update feat: using endpoint for all discourse posts!
TogetherCrew · Oct 8, 2024 · c4586cf · c4586cf
2 parents 27d751b + 79aa354
commit c4586cf
Show file tree

Hide file tree

Showing 16 changed files with 124 additions and 312 deletions.
diff --git a/dags/analyzer_helper/discourse/extract_raw_data.py b/dags/analyzer_helper/discourse/extract_raw_data.py
@@ -48,9 +48,7 @@ def fetch_post_details(
             operator = ">" if comparison == "gt" else ">="
             where_clause = f"WHERE post.createdAt {operator} $createdAt"
         query = f"""
-        MATCH (forum:DiscourseForum {{endpoint: $forum_endpoint}})
-        WITH forum
-        MATCH (post:DiscoursePost {{forumUuid: forum.uuid}})
+        MATCH (post:DiscoursePost {{endpoint: $forum_endpoint}})
         {where_clause if where_clause else ""}
         {"AND " if where_clause else "WHERE "}
         post.createdAt is NOT NULL
@@ -117,9 +115,7 @@ def get_latest_post_created_at(self, forum_endpoint: str) -> Optional[str]:
             Optional[str]: The created_at timestamp of the latest post, or None if no posts are found.
         """
         query = """
-        MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
-        WITH forum
-        (post:DiscoursePost {forumUuid: forum.uuid})
+        (post:DiscoursePost {endpoint: $forum_endpoint})
         RETURN post.createdAt AS created_at
         ORDER BY post.createdAt DESC
         LIMIT 1

diff --git a/dags/analyzer_helper/discourse/extract_raw_members.py b/dags/analyzer_helper/discourse/extract_raw_members.py
@@ -34,8 +34,7 @@ def fetch_member_details(self, start_date: datetime = None):
         :return: List of dictionaries containing member details.
         """
         query = """
-        MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
-        MATCH (user:DiscourseUser)-[:HAS_JOINED]->(forum)
+        MATCH (user:DiscourseUser {endpoint: $forum_endpoint})
         WHERE user.username IS NOT NULL
         """
 

diff --git a/dags/analyzer_helper/discourse/fetch_categories.py b/dags/analyzer_helper/discourse/fetch_categories.py
@@ -19,9 +19,7 @@ def fetch_all(self) -> list[float]:
             a list of category ids
         """
         query = """
-        MATCH (forum:DiscourseForum {endpoint: $forum_endpoint})
-        WITH forum.uuid as forumUuid
-        MATCH (c:DiscourseCategory {forumUuid: forumUuid})
+        MATCH (c:DiscourseCategory {endpoint: $forum_endpoint})
         RETURN c.id as category_ids
         """
 

diff --git a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_data.py
@@ -24,16 +24,15 @@ def setUpClass(cls):
         with cls.driver.session() as session:
             session.run(
                 """
-            CREATE (f:DiscourseForum {endpoint: $endpoint, uuid: 'forum-uuid'}),
-                (u1:DiscourseUser {id: 'user1', name: 'User One'}),
+            CREATE (u1:DiscourseUser {id: 'user1', name: 'User One'}),
                 (u2:DiscourseUser {id: 'user2', name: 'User Two'}),
                 (p1:DiscoursePost 
                     {
                         id: '1',
                         content: 'Post 1',
                         createdAt: '2023-01-01T00:00:00Z',
                         topicId: 'topic-uuid',
-                        forumUuid: 'forum-uuid',
+                        endpoint: 'http://test_forum',
                         raw: "Sample Text 1",
                         postNumber: 1.0
                     }
@@ -44,12 +43,12 @@ def setUpClass(cls):
                         content: 'Post 2',
                         createdAt: '2023-01-02T00:00:00Z',
                         topicId: 'topic-uuid',
-                        forumUuid: 'forum-uuid',
+                        endpoint: 'http://test_forum',
                         raw: "Sample Text 2",
                         postNumber: 2.0
                     }
                 ),
-                (t:DiscourseTopic {id: 'topic-uuid', forumUuid: 'forum-uuid'}),
+                (t:DiscourseTopic {id: 'topic-uuid', endpoint: 'http://test_forum'}),
                 (c:DiscourseCategory {id: 'category1', name: 'Category 1'}),
                 (p1)<-[:HAS_POST]-(t),
                 (p2)<-[:HAS_POST]-(t),

diff --git a/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_members.py b/dags/analyzer_helper/tests/integration/test_discourse_extract_raw_members.py
@@ -34,28 +34,19 @@ def setUp(self):
         )
 
         with self.driver.session() as session:
-            result_forum = session.run(
-                """
-                CREATE (f:DiscourseForum {endpoint: $forum_endpoint})
-                RETURN id(f) AS id
-                """,
-                forum_endpoint=self.test_forum_endpoint,
-            )
-            self.forum_id = result_forum.single()["id"]
             # Create user1 and relate to forum
             result1 = session.run(
                 """
-                MATCH (f:DiscourseForum {endpoint: $forum_endpoint})
                 CREATE (
                     u:DiscourseUser {
                         id: 'user1',
                         avatarTemplate: 'avatar1',
                         createdAt: '2023-07-01',
                         name: 'user1name',
-                        username: 'username1'
+                        username: 'username1',
+                        endpoint: $forum_endpoint
                     }
                 )
-                -[:HAS_JOINED]->(f)
                 CREATE (u)-[:HAS_BADGE]->(:Badge {id: 'badge1'})
                 RETURN id(u) AS id
                 """,
@@ -65,16 +56,15 @@ def setUp(self):
             # Create user2 and relate to forum
             result2 = session.run(
                 """
-                MATCH (f:DiscourseForum {endpoint: $forum_endpoint})
                 CREATE (u:DiscourseUser {
                         id: 'user2',
                         avatarTemplate: 'avatar2',
                         createdAt: '2023-07-02',
                         name: 'user2name',
-                        username: 'username2'
+                        username: 'username2',
+                        endpoint: $forum_endpoint
                     }
                 )
-                -[:HAS_JOINED]->(f)
                 CREATE (u)-[:HAS_BADGE]->(:Badge {id: 'badge2'})
                 RETURN id(u) AS id
                 """,

diff --git a/dags/analyzer_helper/tests/integration/test_discourse_fetch_categories.py b/dags/analyzer_helper/tests/integration/test_discourse_fetch_categories.py
@@ -10,12 +10,6 @@ def setUp(self):
         self.driver = neo4jConnection.connect_neo4j()
 
         self.endpoint = "endpoint.com"
-        self.forum_uuid = 123
-        with self.driver.session() as session:
-            session.run(
-                "CREATE (forum:DiscourseForum {uuid: $f_uuid, endpoint: $forum_endpoint})",
-                {"forum_endpoint": self.endpoint, "f_uuid": self.forum_uuid},
-            )
         self.fetcher = FetchDiscourseCategories(self.endpoint)
 
     def tearDown(self):
@@ -34,15 +28,15 @@ def test_single_category_available(self):
                 """
                 CREATE (c:DiscourseCategory 
                     {
-                        forumUuid: $forum_uuid,
+                        endpoint: $forum_endpoint,
                         color: "0088CC",
                         name: "test category",
                         descriptionText: "category description",
                         id: 1.0
                     }
                 )
                 """,
-                {"forum_uuid": self.forum_uuid},
+                {"forum_endpoint": self.endpoint},
             )
 
         category_ids = self.fetcher.fetch_all()
@@ -56,7 +50,7 @@ def test_multiple_categories_available(self):
                 """
                 CREATE (:DiscourseCategory 
                     {
-                        forumUuid: $forum_uuid,
+                        endpoint: $forum_endpoint,
                         color: "0088CC",
                         name: "test category",
                         descriptionText: "category description",
@@ -65,7 +59,7 @@ def test_multiple_categories_available(self):
                 )
                 CREATE (:DiscourseCategory 
                     {
-                        forumUuid: $forum_uuid,
+                        endpoint: $forum_endpoint,
                         color: "0088CC",
                         name: "test category 2",
                         descriptionText: "category description 2",
@@ -74,15 +68,15 @@ def test_multiple_categories_available(self):
                 )
                 CREATE (:DiscourseCategory 
                     {
-                        forumUuid: $forum_uuid,
+                        endpoint: $forum_endpoint,
                         color: "0088CC",
                         name: "test category 3",
                         descriptionText: "category description 3",
                         id: 3.0
                     }
                 )
                 """,
-                {"forum_uuid": self.forum_uuid},
+                {"forum_endpoint": self.endpoint},
             )
 
         category_ids = self.fetcher.fetch_all()

diff --git a/dags/hivemind_etl_helpers/discourse_summary_etl.py b/dags/hivemind_etl_helpers/discourse_summary_etl.py
@@ -7,7 +7,6 @@
 from hivemind_etl_helpers.src.db.discourse.summary.prepare_summary import (
     DiscourseSummary,
 )
-from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.sort_summary_docs import sort_summaries_daily
 from llama_index.core import Document, Settings
@@ -40,32 +39,20 @@ def process_discourse_summary(
     prefix = f"COMMUNITYID: {community_id} "
     logging.info(prefix + "Processing summaries")
 
-    forum_uuid = get_forum_uuid(forum_endpoint=forum_endpoint)
-
-    # The below commented lines are for debugging
-    # forum_uuid = [
-    #     {
-    #         "uuid": "851d8069-fc3a-415a-b684-1261d4404092",
-    #     }
-    # ]
-    forum_id = forum_uuid[0]["uuid"]
-    forum_endpoint = forum_endpoint
     process_forum(
-        forum_id=forum_id,
+        forum_endpoint=forum_endpoint,
         community_id=community_id,
         dbname=dbname,
-        log_prefix=f"{prefix}ForumId: {forum_id}",
-        forum_endpoint=forum_endpoint,
+        log_prefix=f"{prefix}Forum endpoint: {forum_endpoint}",
         from_starting_date=from_starting_date,
     )
 
 
 def process_forum(
-    forum_id: str,
+    forum_endpoint: str,
     community_id: str,
     dbname: str,
     log_prefix: str,
-    forum_endpoint: str,
     from_starting_date: datetime,
 ):
     """
@@ -74,16 +61,14 @@ def process_forum(
 
     Parameters
     ------------
-    forum_id : str
+    forum_endpoint : str
         the forum that the community has
     community_id : str
         the community that the forum relates to
     dbname : str
         the data of the community saved within the postgres database `dbname`
     log_predix : str
         the logging prefix to print out
-    forum_endpoint : str
-        the DiscourseForum endpoint for document checking
     from_starting_date : datetime
         the time to start processing documents
     """
@@ -119,17 +104,18 @@ def process_forum(
         f"{log_prefix} Fetching raw data and converting to llama_index.Documents"
     )
 
-    raw_data_grouped = fetch_raw_posts_grouped(forum_id=forum_id, from_date=from_date)
+    raw_data_grouped = fetch_raw_posts_grouped(
+        forum_endpoint=forum_endpoint, from_date=from_date
+    )
 
     if raw_data_grouped != []:
         (
             topic_summary_documents,
             category_summary_documenets,
             daily_summary_documents,
         ) = get_summary_documents(
-            forum_id=forum_id,
-            raw_data_grouped=raw_data_grouped,
             forum_endpoint=forum_endpoint,
+            raw_data_grouped=raw_data_grouped,
         )
 
         node_parser = configure_node_parser(chunk_size=chunk_size)
@@ -164,19 +150,18 @@ def process_forum(
 
 
 def get_summary_documents(
-    forum_id: str, raw_data_grouped: list[Record], forum_endpoint: str
+    forum_endpoint: str,
+    raw_data_grouped: list[Record],
 ) -> tuple[list[Document], list[Document], list[Document],]:
     """
     prepare the summary documents for discourse based on given raw data
 
     Parameters
     ------------
-    forum_id : str
-        the forum uuid just for logging
+    forum_endpoint : str
+        the discourse forum endpoint to use its data
     raw_data_grouped : list[Record]
         a list of neo4j records
-    forum_endpoint : str
-        the endpoint of the forum id
 
     Returns
     --------
@@ -189,9 +174,8 @@ def get_summary_documents(
     """
 
     discourse_summary = DiscourseSummary(
-        forum_id=forum_id,
-        response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
         forum_endpoint=forum_endpoint,
+        response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
     )
 
     summarization_query = (

diff --git a/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py b/dags/hivemind_etl_helpers/discourse_vectorstore_etl.py
@@ -4,7 +4,6 @@
 from hivemind_etl_helpers.src.db.discourse.raw_post_to_documents import (
     fetch_discourse_documents,
 )
-from hivemind_etl_helpers.src.db.discourse.utils.get_forums import get_forum_uuid
 from hivemind_etl_helpers.src.document_node_parser import configure_node_parser
 from hivemind_etl_helpers.src.utils.check_documents import check_documents
 from llama_index.core import Settings
@@ -34,32 +33,20 @@ def process_discourse_vectorstore(
     prefix = f"COMMUNITYID: {community_id} "
     logging.info(prefix)
 
-    forum_uuid = get_forum_uuid(forum_endpoint=forum_endpoint)
-
-    # The below commented lines are for debugging
-    # forum_uuid = [
-    #     {
-    #         "uuid": "851d8069-fc3a-415a-b684-1261d4404092",
-    #     }
-    # ]
-    forum_id = forum_uuid[0]["uuid"]
-    forum_endpoint = forum_endpoint
     process_forum(
-        forum_id=forum_id,
+        forum_endpoint=forum_endpoint,
         community_id=community_id,
         dbname=dbname,
-        log_prefix=f"{prefix}ForumId: {forum_id}",
-        forum_endpoint=forum_endpoint,
+        log_prefix=f"{prefix}ForumId: {forum_endpoint}",
         from_starting_date=from_starting_date,
     )
 
 
 def process_forum(
-    forum_id: str,
+    forum_endpoint: str,
     community_id: str,
     dbname: str,
     log_prefix: str,
-    forum_endpoint: str,
     from_starting_date: datetime,
 ):
     """
@@ -69,16 +56,14 @@ def process_forum(
 
     Parameters
     ------------
-    forum_id : str
-        the forum that the community has
+    forum_endpoint : str
+        the DiscourseForum endpoint for document checking
     community_id : str
         the community that the forum relates to
     dbname : str
         the data of the community saved within the postgres database `dbname`
     log_predix : str
         the logging prefix to print out
-    forum_endpoint : str
-        the DiscourseForum endpoint for document checking
     from_starting_date : datetime
         the time to start processing documents
     """
@@ -106,7 +91,9 @@ def process_forum(
     else:
         from_date = from_last_saved_date
 
-    documents = fetch_discourse_documents(forum_id=forum_id, from_date=from_date)
+    documents = fetch_discourse_documents(
+        forum_endpoint=forum_endpoint, from_date=from_date
+    )
 
     node_parser = configure_node_parser(chunk_size=chunk_size)
     pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)