diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index cce2cda4..7112228d 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -3,9 +3,9 @@ from dotenv import load_dotenv from hivemind_etl_helpers.src.db.github.extract import ( + GithubExtraction, fetch_comments, fetch_commits, - fetch_issues, fetch_pull_requests, ) from hivemind_etl_helpers.src.db.github.github_organization_repos import ( @@ -61,9 +61,13 @@ def process_github_vectorstore( logging.info(f"{len(repository_ids)} repositories to fetch data from!") # EXTRACT + extractor = GithubExtraction() + github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date) github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date) - github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date) + github_issues = extractor.fetch_issues( + repository_id=repository_ids, from_date=from_date + ) github_prs = fetch_pull_requests( repository_id=repository_ids, from_date_created=from_starting_date, diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py index 88992b69..5ab83746 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py @@ -1,5 +1,6 @@ # flake8: noqa from .comments import fetch_comments from .commit import fetch_commits -from .issues import fetch_issues +from .github_extraction import GithubExtraction +from .issues import GithubIssueExtraction from .pull_requests import fetch_pull_requests diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py new file mode 100644 index 00000000..133bdde5 --- /dev/null +++ b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py @@ -0,0 +1,32 @@ +from datetime import datetime + +import neo4j +from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue + + +class GithubExtraction: + def __init__(self): + # to be uncommented once other pull requests + # regarding `extraction` are ready + # self.commits_extraction = GithubCommitExtraction() + # self.pull_requests_extraction = GithubPullRequestsExtraction() + # self.comment_extraction = GitHubCommentExtraction() + self.issue_extraction = GithubIssueExtraction() + + def _fetch_raw_issues( + self, repository_id: list[int], from_date: datetime | None = None, **kwargs + ) -> list[neo4j._data.Record]: + return self.issue_extraction._fetch_raw_issues( + repository_id, from_date, **kwargs + ) + + def fetch_issues( + self, repository_id: list[int], from_date: datetime | None = None, **kwargs + ) -> list[GitHubIssue]: + return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs) + + def fetch_issue_ids( + self, repository_id: list[int], from_date: datetime | None = None, **kwargs + ) -> list[GitHubIssue]: + return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 8cb7cdda..2a9b2013 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -2,96 +2,136 @@ import neo4j from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.schema import GitHubIssue - - -def fetch_raw_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[neo4j._data.Record]: - """ - fetch raw issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data - - Returns - -------- - raw_records : list[neo4j._data.Record] - list of neo4j records as the extracted issues - """ - neo4j_connection = Neo4jConnection() - neo4j_driver = neo4j_connection.connect_neo4j() - query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) - WHERE - i.repository_id IN $repoIds - """ - if from_date is not None: - query += "AND datetime(i.updated_at) >= datetime($from_date)" - - query += """ - MATCH (repo:Repository {id: i.repository_id}) - RETURN - user.login as author_name, - i.id as id, - i.title as title, - i.body as text, - i.state as state, - i.state_reason as state_reason, - i.created_at as created_at, - i.updated_at as updated_at, - i.closed_at as closed_at, - i.latestSavedAt as latest_saved_at, - i.html_url as url, - i.repository_id as repository_id, - repo.full_name as repository_name - ORDER BY datetime(created_at) - """ - - def _exec_query(tx, repoIds, from_date): - result = tx.run(query, repoIds=repoIds, from_date=from_date) - return list(result) - - with neo4j_driver.session() as session: - raw_records = session.execute_read( - _exec_query, - repoIds=repository_id, - from_date=from_date, - ) - - return raw_records - - -def fetch_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[GitHubIssue]: - """ - fetch issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data - - Returns - -------- - github_issues : list[GitHubIssue] - list of neo4j records as the extracted issues - """ - records = fetch_raw_issues(repository_id, from_date) - - github_issues: list[GitHubIssue] = [] - for record in records: - issue = GitHubIssue.from_dict(record) - github_issues.append(issue) - - return github_issues +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID + + +class GithubIssueExtraction: + def __init__(self): + """ + Initializes the GitHubCommentExtraction class + without requiring any parameters. + Establishes a connection to the Neo4j database. + """ + self.neo4j_connection = Neo4jConnection() + self.neo4j_driver = self.neo4j_connection.connect_neo4j() + + def _fetch_raw_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[neo4j._data.Record]: + """ + fetch raw issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + raw_records : list[neo4j._data.Record] + list of neo4j records as the extracted issues + """ + + query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) + WHERE + i.repository_id IN $repoIds + """ + if from_date is not None: + query += "AND datetime(i.updated_at) >= datetime($from_date)" + + query += """ + MATCH (repo:Repository {id: i.repository_id}) + RETURN + user.login as author_name, + i.id as id, + i.title as title, + i.body as text, + i.state as state, + i.state_reason as state_reason, + i.created_at as created_at, + i.updated_at as updated_at, + i.closed_at as closed_at, + i.latestSavedAt as latest_saved_at, + i.html_url as url, + i.repository_id as repository_id, + repo.full_name as repository_name + ORDER BY datetime(created_at) + """ + + def _exec_query(tx, repoIds, from_date): + result = tx.run(query, repoIds=repoIds, from_date=from_date) + return list(result) + + with self.neo4j_driver.session() as session: + raw_records = session.execute_read( + _exec_query, + repoIds=repository_id, + from_date=from_date, + ) + + return raw_records + + def fetch_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssue]: + """ + fetch issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + github_issues : list[GitHubIssue] + list of neo4j records as the extracted issues + """ + records = self._fetch_raw_issues(repository_id, from_date) + + github_issues: list[GitHubIssue] = [] + for record in records: + issue = GitHubIssue.from_dict(record) + github_issues.append(issue) + + return github_issues + + def fetch_issue_ids( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssueID]: + """ + fetch issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + github_issues_ids : list[GitHubIssueID] + list of neo4j records as the extracted issue ids + """ + records = self._fetch_raw_issues(repository_id, from_date) + + github_issue_ids: list[GitHubIssueID] = [] + for record in records: + issue = GitHubIssueID.from_dict(record) + github_issue_ids.append(issue) + + return github_issue_ids diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py index 242b62f1..02de132c 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py @@ -1,5 +1,5 @@ # flake8: noqa from .comment import GitHubComment from .commit import GitHubCommit -from .issue import GitHubIssue +from .issue import GitHubIssue, GitHubIssueID from .pull_request import GitHubPullRequest diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py index 495d3605..82dca6fc 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py @@ -63,3 +63,22 @@ def to_dict(self) -> dict[str, str | int | None]: "repository_name": self.repository_name, "type": "issue", } + + +class GitHubIssueID(GitHubIssue): + def __init__( + self, + id: int, + ) -> None: + self.id = id + + @classmethod + def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID": + return cls( + id=issue["id"], # type: ignore + ) + + def to_dict(self) -> dict[str, str | int | None]: + return { + "id": self.id, + } diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py new file mode 100644 index 00000000..ca79a299 --- /dev/null +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py @@ -0,0 +1,208 @@ +from datetime import datetime +from unittest import TestCase + +from github.neo4j_storage.neo4j_connection import Neo4jConnection +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction + + +class TestGithubETLFetchIssueIDs(TestCase): + def setUp(self) -> None: + self.extractor = GithubExtraction() + neo4j_connection = Neo4jConnection() + self.neo4j_driver = neo4j_connection.connect_neo4j() + with self.neo4j_driver.session() as session: + session.execute_write(lambda tx: tx.run("MATCH (n) DETACH DELETE (n)")) + + def test_get_empty_results_no_from_date(self): + repository_ids = [123, 124] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=None + ) + self.assertEqual(issue_ids, []) + + def test_get_empty_results(self): + repository_ids = [123, 124] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=datetime(2024, 1, 1) + ) + self.assertEqual(issue_ids, []) + + def test_get_single_issue_id_single_repo(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, + ) + + self.assertEqual(len(issue_ids), 1) + self.assertEqual(issue_ids[0].id, 21200001) + + def test_get_multiple_issue_ids_single_repo(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (i2:Issue)<-[:CREATED]-(:GitHubUser {login: "author #2"}) + SET + i2.state_reason = "completed", + i2.body = "explanation of some sample issue 2", + i2.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i2.closed_at = "2024-02-10T12:56:05Z", + i2.comments = 0, + i2.created_at = "2024-02-09T10:23:50Z", + i2.title = "some sample title 2", + i2.url = "https://api.github.com/repos/GitHub/some_repo/issues/2", + i2.author_association = "CONTRIBUTOR", + i2.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/labels{/name}", + i2.number = 1, + i2.updated_at = "2024-02-09T12:56:05Z", + i2.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/events", + i2.html_url = "https://github.com/GitHub/some_repo/issues/2", + i2.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/comments", + i2.repository_id = 123, + i2.id = 21200002, + i2.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i2.state = "closed", + i2.locked = false, + i2.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/timeline", + i2.node_id = "some_id2" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, + ) + + self.assertEqual(len(issue_ids), 2) + self.assertEqual(issue_ids[0].id, 21200001) + self.assertEqual(issue_ids[1].id, 21200002) + + def test_get_multiple_issue_ids_single_repo_with_filtering(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (i2:Issue)<-[:CREATED]-(:GitHubUser {login: "author #2"}) + SET + i2.state_reason = "completed", + i2.body = "explanation of some sample issue 2", + i2.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i2.closed_at = "2024-02-10T12:56:05Z", + i2.comments = 0, + i2.created_at = "2024-02-09T10:23:50Z", + i2.title = "some sample title 2", + i2.url = "https://api.github.com/repos/GitHub/some_repo/issues/2", + i2.author_association = "CONTRIBUTOR", + i2.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/labels{/name}", + i2.number = 1, + i2.updated_at = "2024-02-09T12:56:05Z", + i2.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/events", + i2.html_url = "https://github.com/GitHub/some_repo/issues/2", + i2.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/comments", + i2.repository_id = 123, + i2.id = 21200002, + i2.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i2.state = "closed", + i2.locked = false, + i2.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/timeline", + i2.node_id = "some_id2" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=datetime(2024, 2, 8) + ) + + self.assertEqual(len(issue_ids), 1) + self.assertEqual(issue_ids[0].id, 21200002) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py index 0a6f8052..34316c0d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py @@ -2,11 +2,12 @@ from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.extract import fetch_issues +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction class TestGithubETLFetchIssues(TestCase): def setUp(self) -> None: + self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() with self.neo4j_driver.session() as session: @@ -14,12 +15,14 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = fetch_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor.fetch_issues( + repository_id=repository_ids, from_date=None + ) self.assertEqual(issues, []) def test_get_empty_results(self): repository_ids = [123, 124] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) self.assertEqual(issues, []) @@ -60,7 +63,7 @@ def test_get_single_issue_single_repo(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, ) @@ -138,7 +141,7 @@ def test_get_multiple_issues_single_repo(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, ) @@ -230,7 +233,7 @@ def test_get_multiple_issues_single_repo_with_filtering(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, from_date=datetime(2024, 2, 8) ) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index 999b8b7a..473dcf6b 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -2,11 +2,12 @@ from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.extract.issues import fetch_raw_issues +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction class TestGithubETLFetchRawIssues(TestCase): def setUp(self) -> None: + self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() with self.neo4j_driver.session() as session: @@ -14,12 +15,14 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = fetch_raw_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor._fetch_raw_issues( + repository_id=repository_ids, from_date=None + ) self.assertEqual(issues, []) def test_get_empty_results(self): repository_ids = [123, 124] - issues = fetch_raw_issues( + issues = self.extractor._fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) self.assertEqual(issues, []) @@ -60,7 +63,7 @@ def test_get_single_issue_single_repo(self): ) repository_ids = [123] - issues = fetch_raw_issues( + issues = self.extractor._fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) )