TogetherCrew · JacobCWBillings · Mar 29, 2024 · Mar 29, 2024 · Mar 29, 2024 · Mar 29, 2024
diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py
@@ -3,9 +3,9 @@
 
 from dotenv import load_dotenv
 from hivemind_etl_helpers.src.db.github.extract import (
+    GithubExtraction,
     fetch_comments,
     fetch_commits,
-    fetch_issues,
     fetch_pull_requests,
 )
 from hivemind_etl_helpers.src.db.github.github_organization_repos import (
@@ -61,9 +61,13 @@ def process_github_vectorstore(
     logging.info(f"{len(repository_ids)} repositories to fetch data from!")
 
     # EXTRACT
+    extractor = GithubExtraction()
+
     github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date)
     github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date)
-    github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date)
+    github_issues = extractor.fetch_issues(
+        repository_id=repository_ids, from_date=from_date
+    )
     github_prs = fetch_pull_requests(
         repository_id=repository_ids,
         from_date_created=from_starting_date,

diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py
@@ -1,5 +1,5 @@
 # flake8: noqa
 from .comments import fetch_comments
 from .commit import fetch_commits
-from .issues import fetch_issues
+from .github_extraction import GithubExtraction
 from .pull_requests import fetch_pull_requests
diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py
@@ -0,0 +1,71 @@
+from datetime import datetime
+
+import neo4j
+from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction
+from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID
+
+
+class GithubExtraction:
+    """
+    Groups the GitHub extraction helpers behind one object.
+
+    Each method delegates to a dedicated extractor instance;
+    only the issue extractor is wired in at the moment.
+    """
+
+    def __init__(self):
+        """Create the extractor instances this class delegates to."""
+        # to be uncommented once other pull requests
+        # regarding `extraction` are ready
+        # self.commits_extraction = GithubCommitExtraction()
+        # self.pull_requests_extraction = GithubPullRequestsExtraction()
+        # self.comment_extraction = GitHubCommentExtraction()
+        self.issue_extraction = GithubIssueExtraction()
+
+    def _fetch_raw_issues(
+        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
+    ) -> list[neo4j._data.Record]:
+        """
+        Return the raw issue records produced by the issue extractor.
+
+        All arguments, including any extra keyword arguments, are passed
+        unchanged to `GithubIssueExtraction._fetch_raw_issues`.
+        """
+        return self.issue_extraction._fetch_raw_issues(
+            repository_id, from_date, **kwargs
+        )
+
+    def fetch_issues(
+        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
+    ) -> list[GitHubIssue]:
+        """
+        Return the issues of the given repositories as `GitHubIssue` objects.
+
+        All arguments, including any extra keyword arguments, are passed
+        unchanged to `GithubIssueExtraction.fetch_issues`.
+        """
+        return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs)
+
+    def _fetch_raw_issue_ids(
+        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
+    ) -> list[neo4j._data.Record]:
+        """
+        Return the raw issue-id records produced by the issue extractor.
+
+        All arguments, including any extra keyword arguments, are passed
+        unchanged to `GithubIssueExtraction._fetch_raw_issue_ids`.
+        """
+        return self.issue_extraction._fetch_raw_issue_ids(
+            repository_id, from_date, **kwargs
+        )
+
+    def fetch_issue_ids(
+        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
+    ) -> list[GitHubIssueID]:
+        """
+        Return the issue ids of the given repositories as `GitHubIssueID` objects.
+
+        All arguments, including any extra keyword arguments, are passed
+        unchanged to `GithubIssueExtraction.fetch_issue_ids`.
+        """
+        return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs)
diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py
@@ -2,96 +2,187 @@
 
 import neo4j
 from github.neo4j_storage.neo4j_connection import Neo4jConnection
-from hivemind_etl_helpers.src.db.github.schema import GitHubIssue
-
-
-def fetch_raw_issues(
-    repository_id: list[int],
-    from_date: datetime | None = None,
-) -> list[neo4j._data.Record]:
-    """
-    fetch raw issues from data dump in neo4j
-
-    Parameters
-    ------------
-    repository_id : list[int]
-        a list of repository id to fetch their issues
-    from_date : datetime | None
-        get the issues form a specific date that they were created
-        defualt is `None`, meaning to apply no filtering on data
-
-    Returns
-    --------
-    raw_records : list[neo4j._data.Record]
-        list of neo4j records as the extracted issues
-    """
-    neo4j_connection = Neo4jConnection()
-    neo4j_driver = neo4j_connection.connect_neo4j()
-    query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
-        WHERE
-        i.repository_id IN $repoIds
-    """
-    if from_date is not None:
-        query += "AND datetime(i.updated_at) >= datetime($from_date)"
-
-    query += """
-        MATCH (repo:Repository {id: i.repository_id})
-        RETURN
-            user.login as author_name,
-            i.id as id,
-            i.title as title,
-            i.body as text,
-            i.state as state,
-            i.state_reason as state_reason,
-            i.created_at as created_at,
-            i.updated_at as updated_at,
-            i.closed_at as closed_at,
-            i.latestSavedAt as latest_saved_at,
-            i.html_url as url,
-            i.repository_id as repository_id,
-            repo.full_name as repository_name
-        ORDER BY datetime(created_at)
-    """
-
-    def _exec_query(tx, repoIds, from_date):
-        result = tx.run(query, repoIds=repoIds, from_date=from_date)
-        return list(result)
-
-    with neo4j_driver.session() as session:
-        raw_records = session.execute_read(
-            _exec_query,
-            repoIds=repository_id,
-            from_date=from_date,
-        )
-
-    return raw_records
-
-
-def fetch_issues(
-    repository_id: list[int],
-    from_date: datetime | None = None,
-) -> list[GitHubIssue]:
-    """
-    fetch issues from data dump in neo4j
-
-    Parameters
-    ------------
-    repository_id : list[int]
-        a list of repository id to fetch their issues
-    from_date : datetime | None
-        get the issues form a specific date that they were created
-        defualt is `None`, meaning to apply no filtering on data
-
-    Returns
-    --------
-    github_issues : list[GitHubIssue]
-        list of neo4j records as the extracted issues
-    """
-    records = fetch_raw_issues(repository_id, from_date)
-
-    github_issues: list[GitHubIssue] = []
-    for record in records:
-        issue = GitHubIssue.from_dict(record)
-        github_issues.append(issue)
-
-    return github_issues
+from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID
+
+
+class GithubIssueExtraction:
+    def __init__(self):
+        """
+        Initializes the GithubIssueExtraction class
+        without requiring any parameters.
+        Establishes a connection to the Neo4j database.
+        """
+        self.neo4j_connection = Neo4jConnection()
+        self.neo4j_driver = self.neo4j_connection.connect_neo4j()
+
+    def _fetch_raw_issues(
+        self,
+        repository_id: list[int],
+        from_date: datetime | None = None,
+    ) -> list[neo4j._data.Record]:
+        """
+        fetch raw issues from data dump in neo4j
+
+        Parameters
+        ------------
+        repository_id : list[int]
+            a list of repository id to fetch their issues
+        from_date : datetime | None
+            keep only the issues whose `updated_at` is at or after this date
+            default is `None`, meaning to apply no filtering on data
+
+        Returns
+        --------
+        raw_records : list[neo4j._data.Record]
+            list of neo4j records as the extracted issues
+        """
+
+        query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
+            WHERE
+            i.repository_id IN $repoIds
+        """
+        if from_date is not None:
+            query += "AND datetime(i.updated_at) >= datetime($from_date)"
+
+        query += """
+            MATCH (repo:Repository {id: i.repository_id})
+            RETURN
+                user.login as author_name,
+                i.id as id,
+                i.title as title,
+                i.body as text,
+                i.state as state,
+                i.state_reason as state_reason,
+                i.created_at as created_at,
+                i.updated_at as updated_at,
+                i.closed_at as closed_at,
+                i.latestSavedAt as latest_saved_at,
+                i.html_url as url,
+                i.repository_id as repository_id,
+                repo.full_name as repository_name
+            ORDER BY datetime(created_at)
+        """
+
+        def _exec_query(tx, repoIds, from_date):
+            result = tx.run(query, repoIds=repoIds, from_date=from_date)
+            return list(result)
+
+        with self.neo4j_driver.session() as session:
+            raw_records = session.execute_read(
+                _exec_query,
+                repoIds=repository_id,
+                from_date=from_date,
+            )
+
+        return raw_records
+
+    def _fetch_raw_issue_ids(
+        self,
+        repository_id: list[int],
+        from_date: datetime | None = None,
+    ) -> list[neo4j._data.Record]:
+        """
+        fetch raw issue ids from data dump in neo4j
+
+        Parameters
+        ------------
+        repository_id : list[int]
+            a list of repository id to fetch their issue ids
+        from_date : datetime | None
+            keep only the issues whose `updated_at` is at or after this date
+            default is `None`, meaning to apply no filtering on data
+
+        Returns
+        --------
+        raw_records : list[neo4j._data.Record]
+            list of neo4j records as the extracted issue ids
+        """
+
+        query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
+            WHERE
+            i.repository_id IN $repoIds
+        """
+        if from_date is not None:
+            query += "AND datetime(i.updated_at) >= datetime($from_date)"
+
+        query += """
+            MATCH (repo:Repository {id: i.repository_id})
+            RETURN
+                i.id as id,
+                i.created_at as created_at,
+                repo.full_name as repository_name
+            ORDER BY datetime(created_at)
+        """
+
+        def _exec_query(tx, repoIds, from_date):
+            result = tx.run(query, repoIds=repoIds, from_date=from_date)
+            return list(result)
+
+        with self.neo4j_driver.session() as session:
+            raw_records = session.execute_read(
+                _exec_query,
+                repoIds=repository_id,
+                from_date=from_date,
+            )
+
+        return raw_records
+
+    def fetch_issues(
+        self,
+        repository_id: list[int],
+        from_date: datetime | None = None,
+    ) -> list[GitHubIssue]:
+        """
+        fetch issues from data dump in neo4j
+
+        Parameters
+        ------------
+        repository_id : list[int]
+            a list of repository id to fetch their issues
+        from_date : datetime | None
+            keep only the issues whose `updated_at` is at or after this date
+            default is `None`, meaning to apply no filtering on data
+
+        Returns
+        --------
+        github_issues : list[GitHubIssue]
+            list of `GitHubIssue` objects built from the extracted records
+        """
+        records = self._fetch_raw_issues(repository_id, from_date)
+
+        github_issues: list[GitHubIssue] = []
+        for record in records:
+            issue = GitHubIssue.from_dict(record)
+            github_issues.append(issue)
+
+        return github_issues
+
+    def fetch_issue_ids(
+        self,
+        repository_id: list[int],
+        from_date: datetime | None = None,
+    ) -> list[GitHubIssueID]:
+        """
+        fetch issue ids from data dump in neo4j
+
+        Parameters
+        ------------
+        repository_id : list[int]
+            a list of repository id to fetch their issue ids
+        from_date : datetime | None
+            keep only the issues whose `updated_at` is at or after this date
+            default is `None`, meaning to apply no filtering on data
+
+        Returns
+        --------
+        github_issue_ids : list[GitHubIssueID]
+            list of `GitHubIssueID` objects built from the extracted records
+        """
+        records = self._fetch_raw_issue_ids(repository_id, from_date)
+
+        github_issue_ids: list[GitHubIssueID] = []
+        for record in records:
+            issue = GitHubIssueID.from_dict(record)
+            github_issue_ids.append(issue)
+
+        return github_issue_ids
diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py
@@ -1,5 +1,5 @@
 # flake8: noqa
 from .comment import GitHubComment
 from .commit import GitHubCommit
-from .issue import GitHubIssue
+from .issue import GitHubIssue, GitHubIssueID
 from .pull_request import GitHubPullRequest
diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py
@@ -63,3 +63,22 @@ def to_dict(self) -> dict[str, str | int | None]:
             "repository_name": self.repository_name,
             "type": "issue",
         }
+
+
+class GitHubIssueID(GitHubIssue):
+    """Issue record that carries only the issue `id`."""
+
+    def __init__(self, id: int) -> None:
+        # only `id` is stored; the parent initializer is not invoked
+        self.id = id
+
+    @classmethod
+    def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID":
+        """Build an instance from a mapping that has an `id` key."""
+        issue_id = issue["id"]
+        return cls(id=issue_id)  # type: ignore
+
+    def to_dict(self) -> dict[str, str | int | None]:
+        """Return a dictionary holding just the `id` value."""
+        # no other field is serialized for this class
+        return {"id": self.id}