From be0fb2d4c2f7f1b8af6ad082e5536b6576c8251a Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 29 Mar 2024 14:00:06 +0100 Subject: [PATCH 01/15] Functional to OOP refactor for fetch_issues method --- dags/hivemind_etl_helpers/github_etl.py | 5 +- .../src/db/github/extract/__init__.py | 7 +- .../src/db/github/extract/issues.py | 165 +++++++++--------- .../test_github_etl_fetch_issues.py | 16 +- 4 files changed, 106 insertions(+), 87 deletions(-) diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index cce2cda4..6649bac9 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -7,6 +7,7 @@ fetch_commits, fetch_issues, fetch_pull_requests, + GithubExtraction, ) from hivemind_etl_helpers.src.db.github.github_organization_repos import ( get_github_organization_repos, @@ -61,9 +62,11 @@ def process_github_vectorstore( logging.info(f"{len(repository_ids)} repositories to fetch data from!") # EXTRACT + extractor = GithubExtraction() + github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date) github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date) - github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date) + github_issues = extractor.fetch_issues(repository_id=repository_ids, from_date=from_date) github_prs = fetch_pull_requests( repository_id=repository_ids, from_date_created=from_starting_date, diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py index 88992b69..c62af9b6 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py @@ -1,5 +1,10 @@ # flake8: noqa from .comments import fetch_comments from .commit import fetch_commits -from .issues import fetch_issues +from .issues import GithubIssueExtraction from .pull_requests import fetch_pull_requests + 
+class GithubExtraction(GithubIssueExtraction): + + def __init__(self): + pass \ No newline at end of file diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 8cb7cdda..8400ac22 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -4,94 +4,101 @@ from github.neo4j_storage.neo4j_connection import Neo4jConnection from hivemind_etl_helpers.src.db.github.schema import GitHubIssue +class GithubIssueExtraction(): + + def __init__(self): + pass + + def __fetch_raw_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[neo4j._data.Record]: + """ + fetch raw issues from data dump in neo4j -def fetch_raw_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[neo4j._data.Record]: - """ - fetch raw issues from data dump in neo4j + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + defualt is `None`, meaning to apply no filtering on data - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data + Returns + -------- + raw_records : list[neo4j._data.Record] + list of neo4j records as the extracted issues + """ + neo4j_connection = Neo4jConnection() + neo4j_driver = neo4j_connection.connect_neo4j() + query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) + WHERE + i.repository_id IN $repoIds + """ + if from_date is not None: + query += "AND datetime(i.updated_at) >= datetime($from_date)" - Returns - -------- - raw_records : list[neo4j._data.Record] - list of neo4j records as the extracted issues - """ - 
neo4j_connection = Neo4jConnection() - neo4j_driver = neo4j_connection.connect_neo4j() - query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) - WHERE - i.repository_id IN $repoIds - """ - if from_date is not None: - query += "AND datetime(i.updated_at) >= datetime($from_date)" + query += """ + MATCH (repo:Repository {id: i.repository_id}) + RETURN + user.login as author_name, + i.id as id, + i.title as title, + i.body as text, + i.state as state, + i.state_reason as state_reason, + i.created_at as created_at, + i.updated_at as updated_at, + i.closed_at as closed_at, + i.latestSavedAt as latest_saved_at, + i.html_url as url, + i.repository_id as repository_id, + repo.full_name as repository_name + ORDER BY datetime(created_at) + """ - query += """ - MATCH (repo:Repository {id: i.repository_id}) - RETURN - user.login as author_name, - i.id as id, - i.title as title, - i.body as text, - i.state as state, - i.state_reason as state_reason, - i.created_at as created_at, - i.updated_at as updated_at, - i.closed_at as closed_at, - i.latestSavedAt as latest_saved_at, - i.html_url as url, - i.repository_id as repository_id, - repo.full_name as repository_name - ORDER BY datetime(created_at) - """ + def _exec_query(tx, repoIds, from_date): + result = tx.run(query, repoIds=repoIds, from_date=from_date) + return list(result) - def _exec_query(tx, repoIds, from_date): - result = tx.run(query, repoIds=repoIds, from_date=from_date) - return list(result) + with neo4j_driver.session() as session: + raw_records = session.execute_read( + _exec_query, + repoIds=repository_id, + from_date=from_date, + ) - with neo4j_driver.session() as session: - raw_records = session.execute_read( - _exec_query, - repoIds=repository_id, - from_date=from_date, - ) + return raw_records + + def fetch_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssue]: + """ + fetch issues from data dump in neo4j - return raw_records + Parameters + ------------ + 
repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + defualt is `None`, meaning to apply no filtering on data + Returns + -------- + github_issues : list[GitHubIssue] + list of neo4j records as the extracted issues + """ + records = self.__fetch_raw_issues(repository_id, from_date) -def fetch_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[GitHubIssue]: - """ - fetch issues from data dump in neo4j + github_issues: list[GitHubIssue] = [] + for record in records: + issue = GitHubIssue.from_dict(record) + github_issues.append(issue) - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data + return github_issues - Returns - -------- - github_issues : list[GitHubIssue] - list of neo4j records as the extracted issues - """ - records = fetch_raw_issues(repository_id, from_date) - github_issues: list[GitHubIssue] = [] - for record in records: - issue = GitHubIssue.from_dict(record) - github_issues.append(issue) - - return github_issues diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py index 0a6f8052..a090f974 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py @@ -2,10 +2,14 @@ from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.extract import fetch_issues +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction class TestGithubETLFetchIssues(TestCase): + + def __init__(self): + self.extractor = 
GithubExtraction() + def setUp(self) -> None: neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() @@ -14,12 +18,12 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = fetch_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor.fetch_issues(repository_id=repository_ids, from_date=None) self.assertEqual(issues, []) def test_get_empty_results(self): repository_ids = [123, 124] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) self.assertEqual(issues, []) @@ -60,7 +64,7 @@ def test_get_single_issue_single_repo(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, ) @@ -138,7 +142,7 @@ def test_get_multiple_issues_single_repo(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, ) @@ -230,7 +234,7 @@ def test_get_multiple_issues_single_repo_with_filtering(self): ) repository_ids = [123] - issues = fetch_issues( + issues = self.extractor.fetch_issues( repository_id=repository_ids, from_date=datetime(2024, 2, 8) ) From fe6884d4cc242db6e0c75e65d6b10c7fcd679fcf Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 29 Mar 2024 15:10:15 +0100 Subject: [PATCH 02/15] retest functional to OOP refactor, fetch_issues method --- .../src/db/github/extract/issues.py | 1 + .../integration/test_github_etl_fetch_raw_issues.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 8400ac22..e42e751b 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -100,5 +100,6 @@ def fetch_issues( github_issues.append(issue) return 
github_issues + diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index 999b8b7a..fe1a1b18 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -2,10 +2,14 @@ from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.extract.issues import fetch_raw_issues +from hivemind_etl_helpers.src.db.github.extract.issues import GithubExtraction class TestGithubETLFetchRawIssues(TestCase): + + def __init__(self): + self.extractor = GithubExtraction() + def setUp(self) -> None: neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() @@ -14,12 +18,12 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = fetch_raw_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor._GithubIssueExtraction__fetch_raw_issues(repository_id=repository_ids, from_date=None) self.assertEqual(issues, []) def test_get_empty_results(self): repository_ids = [123, 124] - issues = fetch_raw_issues( + issues = self.extractor._GithubIssueExtraction__fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) self.assertEqual(issues, []) @@ -60,7 +64,7 @@ def test_get_single_issue_single_repo(self): ) repository_ids = [123] - issues = fetch_raw_issues( + issues = self.extractor._GithubIssueExtraction__fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) From 9ee47479253daf887f85f481f67caf6108baf479 Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 29 Mar 2024 15:16:42 +0100 Subject: [PATCH 03/15] test functional to OOP refactor, fetch_issues method --- .../tests/integration/test_github_etl_fetch_raw_issues.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index fe1a1b18..64014321 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -2,7 +2,7 @@ from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.extract.issues import GithubExtraction +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction class TestGithubETLFetchRawIssues(TestCase): From 6b12419b1b32b445a617f0c1a1f1d72abba2d84a Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 29 Mar 2024 15:37:42 +0100 Subject: [PATCH 04/15] test functional to OOP refactor, fetch_issues method --- .../tests/integration/test_github_etl_fetch_issues.py | 4 +--- .../tests/integration/test_github_etl_fetch_raw_issues.py | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py index a090f974..3ae25d81 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py @@ -7,10 +7,8 @@ class TestGithubETLFetchIssues(TestCase): - def __init__(self): - self.extractor = GithubExtraction() - def setUp(self) -> None: + self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() with self.neo4j_driver.session() as session: diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index 64014321..4d8fe83f 
100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -6,11 +6,9 @@ class TestGithubETLFetchRawIssues(TestCase): - - def __init__(self): - self.extractor = GithubExtraction() - + def setUp(self) -> None: + self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() self.neo4j_driver = neo4j_connection.connect_neo4j() with self.neo4j_driver.session() as session: From 549699bbca823021058c6eeeba514f75bd700050 Mon Sep 17 00:00:00 2001 From: Jacob Date: Sun, 31 Mar 2024 15:57:00 +0200 Subject: [PATCH 05/15] lint code --- dags/hivemind_etl_helpers/github_etl.py | 10 ++++++---- .../src/db/github/extract/__init__.py | 5 ++--- .../src/db/github/extract/issues.py | 11 ++++------- .../tests/integration/test_github_etl_fetch_issues.py | 5 +++-- .../integration/test_github_etl_fetch_raw_issues.py | 5 +++-- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index 6649bac9..3471a05e 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -3,11 +3,11 @@ from dotenv import load_dotenv from hivemind_etl_helpers.src.db.github.extract import ( + GithubExtraction, fetch_comments, fetch_commits, - fetch_issues, + # fetch_issues, fetch_pull_requests, - GithubExtraction, ) from hivemind_etl_helpers.src.db.github.github_organization_repos import ( get_github_organization_repos, @@ -63,10 +63,12 @@ def process_github_vectorstore( # EXTRACT extractor = GithubExtraction() - + github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date) github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date) - github_issues = extractor.fetch_issues(repository_id=repository_ids, from_date=from_date) + github_issues = extractor.fetch_issues( + repository_id=repository_ids, 
from_date=from_date + ) github_prs = fetch_pull_requests( repository_id=repository_ids, from_date_created=from_starting_date, diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py index c62af9b6..87a1413d 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py @@ -4,7 +4,6 @@ from .issues import GithubIssueExtraction from .pull_requests import fetch_pull_requests + class GithubExtraction(GithubIssueExtraction): - - def __init__(self): - pass \ No newline at end of file + pass diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index e42e751b..e4dc0b39 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -4,11 +4,11 @@ from github.neo4j_storage.neo4j_connection import Neo4jConnection from hivemind_etl_helpers.src.db.github.schema import GitHubIssue -class GithubIssueExtraction(): - + +class GithubIssueExtraction: def __init__(self): pass - + def __fetch_raw_issues( self, repository_id: list[int], @@ -70,7 +70,7 @@ def _exec_query(tx, repoIds, from_date): ) return raw_records - + def fetch_issues( self, repository_id: list[int], @@ -100,6 +100,3 @@ def fetch_issues( github_issues.append(issue) return github_issues - - - diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py index 3ae25d81..34316c0d 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issues.py @@ -6,7 +6,6 @@ class TestGithubETLFetchIssues(TestCase): - def setUp(self) -> None: self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() @@ -16,7 +15,9 
@@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = self.extractor.fetch_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor.fetch_issues( + repository_id=repository_ids, from_date=None + ) self.assertEqual(issues, []) def test_get_empty_results(self): diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index 4d8fe83f..3fc0b1da 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -6,7 +6,6 @@ class TestGithubETLFetchRawIssues(TestCase): - def setUp(self) -> None: self.extractor = GithubExtraction() neo4j_connection = Neo4jConnection() @@ -16,7 +15,9 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = self.extractor._GithubIssueExtraction__fetch_raw_issues(repository_id=repository_ids, from_date=None) + issues = self.extractor._GithubIssueExtraction__fetch_raw_issues( + repository_id=repository_ids, from_date=None + ) self.assertEqual(issues, []) def test_get_empty_results(self): From 3278cd5b8e4485ea39a0acc0d07dda068a5b91de Mon Sep 17 00:00:00 2001 From: Jacob Date: Sun, 31 Mar 2024 16:18:30 +0200 Subject: [PATCH 06/15] lint code --- dags/hivemind_etl_helpers/github_etl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index 3471a05e..11391679 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -2,11 +2,10 @@ from datetime import datetime from dotenv import load_dotenv -from hivemind_etl_helpers.src.db.github.extract import ( +from hivemind_etl_helpers.src.db.github.extract import ( # fetch_issues, GithubExtraction, 
fetch_comments, fetch_commits, - # fetch_issues, fetch_pull_requests, ) from hivemind_etl_helpers.src.db.github.github_organization_repos import ( From 4d2f753f2172675e1d20ec2ff05572d32290e1e4 Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 12 Apr 2024 11:05:05 +0200 Subject: [PATCH 07/15] Draft commit, Bounty 2 --- .../src/db/github/extract/issues.py | 36 +++++++++++++++++-- .../src/db/github/schema/__init__.py | 2 +- .../src/db/github/schema/issue.py | 18 ++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index e4dc0b39..b9f0d6b6 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -2,7 +2,7 @@ import neo4j from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.schema import GitHubIssue +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID class GithubIssueExtraction: @@ -23,7 +23,7 @@ def __fetch_raw_issues( a list of repository id to fetch their issues from_date : datetime | None get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data + default is `None`, meaning to apply no filtering on data Returns -------- @@ -85,7 +85,7 @@ def fetch_issues( a list of repository id to fetch their issues from_date : datetime | None get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data + default is `None`, meaning to apply no filtering on data Returns -------- @@ -100,3 +100,33 @@ def fetch_issues( github_issues.append(issue) return github_issues + + def fetch_issue_ids( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssueID]: + """ + fetch issues from data dump in neo4j + + Parameters + ------------ + 
repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + github_issues_ids : list[GitHubIssueID] + list of neo4j records as the extracted issue ids + """ + records = self.__fetch_raw_issues(repository_id, from_date) + + github_issue_ids: list[GitHubIssueID] = [] + for record in records: + issue = GitHubIssueID.from_dict(record) + github_issues_ids.append(issue) + + return github_issue_ids diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py index 242b62f1..02de132c 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py @@ -1,5 +1,5 @@ # flake8: noqa from .comment import GitHubComment from .commit import GitHubCommit -from .issue import GitHubIssue +from .issue import GitHubIssue, GitHubIssueID from .pull_request import GitHubPullRequest diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py index 495d3605..88e69a7c 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py @@ -63,3 +63,21 @@ def to_dict(self) -> dict[str, str | int | None]: "repository_name": self.repository_name, "type": "issue", } + +class GitHubIssueID: + def __init__( + self, + id: int, + ) -> None: + self.id = id + + @classmethod + def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID": + return cls( + id=issue["id"], # type: ignore + ) + + def to_dict(self) -> dict[str, str | int | None]: + return { + "id": self.id, + } From ef4da2080cb98feb8e5c122021aa366b895d06d1 Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 12 Apr 2024 11:05:19 +0200 Subject: [PATCH 08/15] Draft commit, Bounty 2 --- 
.../test_github_etl_fetch_issue_ids.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py new file mode 100644 index 00000000..ec7d384e --- /dev/null +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py @@ -0,0 +1,209 @@ +from datetime import datetime +from unittest import TestCase + +from pathlib import Path + +from github.neo4j_storage.neo4j_connection import Neo4jConnection +from hivemind_etl_helpers.src.db.github.extract import GithubExtraction + +class TestGithubETLFetchIssueIDs(TestCase): + def setUp(self) -> None: + self.extractor = GithubExtraction() + neo4j_connection = Neo4jConnection() + self.neo4j_driver = neo4j_connection.connect_neo4j() + with self.neo4j_driver.session() as session: + session.execute_write(lambda tx: tx.run("MATCH (n) DETACH DELETE (n)")) + + def test_get_empty_results_no_from_date(self): + repository_ids = [123, 124] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=None + ) + self.assertEqual(issue_ids, []) + + def test_get_empty_results(self): + repository_ids = [123, 124] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=datetime(2024, 1, 1) + ) + self.assertEqual(issue_ids, []) + + def test_get_single_issue_id_single_repo(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample 
title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, + ) + + self.assertEqual(len(issue_ids), 1) + self.assertEqual(issue_ids[0].id, 21200001) + + def test_get_multiple_issue_ids_single_repo(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = 
"https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (i2:Issue)<-[:CREATED]-(:GitHubUser {login: "author #2"}) + SET + i2.state_reason = "completed", + i2.body = "explanation of some sample issue 2", + i2.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i2.closed_at = "2024-02-10T12:56:05Z", + i2.comments = 0, + i2.created_at = "2024-02-09T10:23:50Z", + i2.title = "some sample title 2", + i2.url = "https://api.github.com/repos/GitHub/some_repo/issues/2", + i2.author_association = "CONTRIBUTOR", + i2.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/labels{/name}", + i2.number = 1, + i2.updated_at = "2024-02-09T12:56:05Z", + i2.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/events", + i2.html_url = "https://github.com/GitHub/some_repo/issues/2", + i2.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/comments", + i2.repository_id = 123, + i2.id = 21200002, + i2.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i2.state = "closed", + i2.locked = false, + i2.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/timeline", + i2.node_id = "some_id2" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, + ) + + self.assertEqual(len(issue_ids), 2) + self.assertEqual(issue_ids[0].id, 21200001) + self.assertEqual(issue_ids[1].id, 21200002) + + def test_get_multiple_issue_ids_single_repo_with_filtering(self): + with self.neo4j_driver.session() as session: + session.execute_write( + lambda tx: tx.run( + """ + CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: 
"author #1"}) + SET + i.state_reason = "completed", + i.body = "explanation of some sample issue", + i.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i.closed_at = "2024-02-06T12:56:05Z", + i.comments = 0, + i.created_at = "2024-02-06T10:23:50Z", + i.title = "some sample title", + i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1", + i.author_association = "CONTRIBUTOR", + i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}", + i.number = 1, + i.updated_at = "2024-02-06T12:56:05Z", + i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events", + i.html_url = "https://github.com/GitHub/some_repo/issues/1", + i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments", + i.repository_id = 123, + i.id = 21200001, + i.repository_url = "https://api.github.com/repos/GitHub/some_repo", + i.state = "closed", + i.locked = false, + i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline", + i.node_id = "some_id" + + CREATE (i2:Issue)<-[:CREATED]-(:GitHubUser {login: "author #2"}) + SET + i2.state_reason = "completed", + i2.body = "explanation of some sample issue 2", + i2.latestSavedAt = "2024-02-15T06:10:02.262000000Z", + i2.closed_at = "2024-02-10T12:56:05Z", + i2.comments = 0, + i2.created_at = "2024-02-09T10:23:50Z", + i2.title = "some sample title 2", + i2.url = "https://api.github.com/repos/GitHub/some_repo/issues/2", + i2.author_association = "CONTRIBUTOR", + i2.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/labels{/name}", + i2.number = 1, + i2.updated_at = "2024-02-09T12:56:05Z", + i2.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/events", + i2.html_url = "https://github.com/GitHub/some_repo/issues/2", + i2.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/comments", + i2.repository_id = 123, + i2.id = 21200002, + i2.repository_url = "https://api.github.com/repos/GitHub/some_repo", + 
i2.state = "closed", + i2.locked = false, + i2.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/2/timeline", + i2.node_id = "some_id2" + + CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"}) + """ + ) + ) + + repository_ids = [123] + issue_ids = self.extractor.fetch_issue_ids( + repository_id=repository_ids, from_date=datetime(2024, 2, 8) + ) + + self.assertEqual(len(issue_ids), 1) + self.assertEqual(issue_ids[0].id, 21200002) From db159ccead886c2c3310926a077bd917e8b3359a Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 12 Apr 2024 11:18:43 +0200 Subject: [PATCH 09/15] Draft commit, Bounty 2, bug fix --- dags/hivemind_etl_helpers/src/db/github/extract/issues.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index b9f0d6b6..4fbd1233 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -127,6 +127,6 @@ def fetch_issue_ids( github_issue_ids: list[GitHubIssueID] = [] for record in records: issue = GitHubIssueID.from_dict(record) - github_issues_ids.append(issue) + github_issue_ids.append(issue) return github_issue_ids From 4d6e3f6047ab739638be8624d31506ff8afa57e6 Mon Sep 17 00:00:00 2001 From: Jacob Date: Fri, 12 Apr 2024 18:21:29 +0200 Subject: [PATCH 10/15] Draft commit, Bounty 2, Linted --- .../src/db/github/extract/issues.py | 221 ++++++++---------- .../src/db/github/schema/__init__.py | 2 +- .../src/db/github/schema/issue.py | 18 -- 3 files changed, 94 insertions(+), 147 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 4fbd1233..8cb7cdda 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -2,131 +2,96 @@ import neo4j from 
github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID - - -class GithubIssueExtraction: - def __init__(self): - pass - - def __fetch_raw_issues( - self, - repository_id: list[int], - from_date: datetime | None = None, - ) -> list[neo4j._data.Record]: - """ - fetch raw issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - default is `None`, meaning to apply no filtering on data - - Returns - -------- - raw_records : list[neo4j._data.Record] - list of neo4j records as the extracted issues - """ - neo4j_connection = Neo4jConnection() - neo4j_driver = neo4j_connection.connect_neo4j() - query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) - WHERE - i.repository_id IN $repoIds - """ - if from_date is not None: - query += "AND datetime(i.updated_at) >= datetime($from_date)" - - query += """ - MATCH (repo:Repository {id: i.repository_id}) - RETURN - user.login as author_name, - i.id as id, - i.title as title, - i.body as text, - i.state as state, - i.state_reason as state_reason, - i.created_at as created_at, - i.updated_at as updated_at, - i.closed_at as closed_at, - i.latestSavedAt as latest_saved_at, - i.html_url as url, - i.repository_id as repository_id, - repo.full_name as repository_name - ORDER BY datetime(created_at) - """ - - def _exec_query(tx, repoIds, from_date): - result = tx.run(query, repoIds=repoIds, from_date=from_date) - return list(result) - - with neo4j_driver.session() as session: - raw_records = session.execute_read( - _exec_query, - repoIds=repository_id, - from_date=from_date, - ) - - return raw_records - - def fetch_issues( - self, - repository_id: list[int], - from_date: datetime | None = None, - ) -> list[GitHubIssue]: - """ - fetch issues from data dump in neo4j - - Parameters - 
------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - default is `None`, meaning to apply no filtering on data - - Returns - -------- - github_issues : list[GitHubIssue] - list of neo4j records as the extracted issues - """ - records = self.__fetch_raw_issues(repository_id, from_date) - - github_issues: list[GitHubIssue] = [] - for record in records: - issue = GitHubIssue.from_dict(record) - github_issues.append(issue) - - return github_issues - - def fetch_issue_ids( - self, - repository_id: list[int], - from_date: datetime | None = None, - ) -> list[GitHubIssueID]: - """ - fetch issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - default is `None`, meaning to apply no filtering on data - - Returns - -------- - github_issues_ids : list[GitHubIssueID] - list of neo4j records as the extracted issue ids - """ - records = self.__fetch_raw_issues(repository_id, from_date) - - github_issue_ids: list[GitHubIssueID] = [] - for record in records: - issue = GitHubIssueID.from_dict(record) - github_issue_ids.append(issue) - - return github_issue_ids +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue + + +def fetch_raw_issues( + repository_id: list[int], + from_date: datetime | None = None, +) -> list[neo4j._data.Record]: + """ + fetch raw issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + defualt is `None`, meaning to apply no filtering on data + + Returns + -------- + raw_records : list[neo4j._data.Record] + list of neo4j records as the extracted issues + """ + neo4j_connection = 
Neo4jConnection() + neo4j_driver = neo4j_connection.connect_neo4j() + query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) + WHERE + i.repository_id IN $repoIds + """ + if from_date is not None: + query += "AND datetime(i.updated_at) >= datetime($from_date)" + + query += """ + MATCH (repo:Repository {id: i.repository_id}) + RETURN + user.login as author_name, + i.id as id, + i.title as title, + i.body as text, + i.state as state, + i.state_reason as state_reason, + i.created_at as created_at, + i.updated_at as updated_at, + i.closed_at as closed_at, + i.latestSavedAt as latest_saved_at, + i.html_url as url, + i.repository_id as repository_id, + repo.full_name as repository_name + ORDER BY datetime(created_at) + """ + + def _exec_query(tx, repoIds, from_date): + result = tx.run(query, repoIds=repoIds, from_date=from_date) + return list(result) + + with neo4j_driver.session() as session: + raw_records = session.execute_read( + _exec_query, + repoIds=repository_id, + from_date=from_date, + ) + + return raw_records + + +def fetch_issues( + repository_id: list[int], + from_date: datetime | None = None, +) -> list[GitHubIssue]: + """ + fetch issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + defualt is `None`, meaning to apply no filtering on data + + Returns + -------- + github_issues : list[GitHubIssue] + list of neo4j records as the extracted issues + """ + records = fetch_raw_issues(repository_id, from_date) + + github_issues: list[GitHubIssue] = [] + for record in records: + issue = GitHubIssue.from_dict(record) + github_issues.append(issue) + + return github_issues diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py index 02de132c..242b62f1 100644 --- 
a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py @@ -1,5 +1,5 @@ # flake8: noqa from .comment import GitHubComment from .commit import GitHubCommit -from .issue import GitHubIssue, GitHubIssueID +from .issue import GitHubIssue from .pull_request import GitHubPullRequest diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py index 88e69a7c..495d3605 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py @@ -63,21 +63,3 @@ def to_dict(self) -> dict[str, str | int | None]: "repository_name": self.repository_name, "type": "issue", } - -class GitHubIssueID: - def __init__( - self, - id: int, - ) -> None: - self.id = id - - @classmethod - def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID": - return cls( - id=issue["id"], # type: ignore - ) - - def to_dict(self) -> dict[str, str | int | None]: - return { - "id": self.id, - } From 1cbbf696f5cea3c33b017e311985ff6d809b6644 Mon Sep 17 00:00:00 2001 From: Jacob Date: Tue, 16 Apr 2024 00:44:19 +0200 Subject: [PATCH 11/15] Bounty 2 commit. Tested. Linted. 
--- .../src/db/github/extract/issues.py | 221 ++++++++++-------- .../src/db/github/schema/__init__.py | 2 +- .../src/db/github/schema/issue.py | 19 ++ .../test_github_etl_fetch_issue_ids.py | 4 +- 4 files changed, 150 insertions(+), 96 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 8cb7cdda..984f69e0 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -2,96 +2,131 @@ import neo4j from github.neo4j_storage.neo4j_connection import Neo4jConnection -from hivemind_etl_helpers.src.db.github.schema import GitHubIssue - - -def fetch_raw_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[neo4j._data.Record]: - """ - fetch raw issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data - - Returns - -------- - raw_records : list[neo4j._data.Record] - list of neo4j records as the extracted issues - """ - neo4j_connection = Neo4jConnection() - neo4j_driver = neo4j_connection.connect_neo4j() - query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) - WHERE - i.repository_id IN $repoIds - """ - if from_date is not None: - query += "AND datetime(i.updated_at) >= datetime($from_date)" - - query += """ - MATCH (repo:Repository {id: i.repository_id}) - RETURN - user.login as author_name, - i.id as id, - i.title as title, - i.body as text, - i.state as state, - i.state_reason as state_reason, - i.created_at as created_at, - i.updated_at as updated_at, - i.closed_at as closed_at, - i.latestSavedAt as latest_saved_at, - i.html_url as url, - i.repository_id as repository_id, - repo.full_name as repository_name - ORDER BY datetime(created_at) - 
""" - - def _exec_query(tx, repoIds, from_date): - result = tx.run(query, repoIds=repoIds, from_date=from_date) - return list(result) - - with neo4j_driver.session() as session: - raw_records = session.execute_read( - _exec_query, - repoIds=repository_id, - from_date=from_date, - ) - - return raw_records - - -def fetch_issues( - repository_id: list[int], - from_date: datetime | None = None, -) -> list[GitHubIssue]: - """ - fetch issues from data dump in neo4j - - Parameters - ------------ - repository_id : list[int] - a list of repository id to fetch their issues - from_date : datetime | None - get the issues form a specific date that they were created - defualt is `None`, meaning to apply no filtering on data - - Returns - -------- - github_issues : list[GitHubIssue] - list of neo4j records as the extracted issues - """ - records = fetch_raw_issues(repository_id, from_date) - - github_issues: list[GitHubIssue] = [] - for record in records: - issue = GitHubIssue.from_dict(record) - github_issues.append(issue) - - return github_issues +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID + + +class GithubIssueExtraction: + def __init__(self): + pass + + def __fetch_raw_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[neo4j._data.Record]: + """ + fetch raw issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues form a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + raw_records : list[neo4j._data.Record] + list of neo4j records as the extracted issues + """ + neo4j_connection = Neo4jConnection() + neo4j_driver = neo4j_connection.connect_neo4j() + query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) + WHERE + i.repository_id IN $repoIds + """ + if from_date is not None: + query += "AND 
datetime(i.updated_at) >= datetime($from_date)" + + query += """ + MATCH (repo:Repository {id: i.repository_id}) + RETURN + user.login as author_name, + i.id as id, + i.title as title, + i.body as text, + i.state as state, + i.state_reason as state_reason, + i.created_at as created_at, + i.updated_at as updated_at, + i.closed_at as closed_at, + i.latestSavedAt as latest_saved_at, + i.html_url as url, + i.repository_id as repository_id, + repo.full_name as repository_name + ORDER BY datetime(created_at) + """ + + def _exec_query(tx, repoIds, from_date): + result = tx.run(query, repoIds=repoIds, from_date=from_date) + return list(result) + + with neo4j_driver.session() as session: + raw_records = session.execute_read( + _exec_query, + repoIds=repository_id, + from_date=from_date, + ) + + return raw_records + + def fetch_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssue]: + """ + fetch issues from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues from a specific date that they were created + default is `None`, meaning to apply no filtering on data + + Returns + -------- + github_issues : list[GitHubIssue] + list of neo4j records as the extracted issues + """ + records = self.__fetch_raw_issues(repository_id, from_date) + + github_issues: list[GitHubIssue] = [] + for record in records: + issue = GitHubIssue.from_dict(record) + github_issues.append(issue) + + return github_issues + + def fetch_issue_ids( + self, + repository_id: list[int], + from_date: datetime | None = None, + ) -> list[GitHubIssueID]: + """ + fetch issue ids from data dump in neo4j + + Parameters + ------------ + repository_id : list[int] + a list of repository id to fetch their issues + from_date : datetime | None + get the issues from a specific date that they were created + default is `None`, meaning to apply no filtering on 
data + + Returns + -------- + github_issues_ids : list[GitHubIssueID] + list of neo4j records as the extracted issue ids + """ + records = self.__fetch_raw_issues(repository_id, from_date) + + github_issue_ids: list[GitHubIssueID] = [] + for record in records: + issue = GitHubIssueID.from_dict(record) + github_issue_ids.append(issue) + + return github_issue_ids diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py index 242b62f1..02de132c 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/__init__.py @@ -1,5 +1,5 @@ # flake8: noqa from .comment import GitHubComment from .commit import GitHubCommit -from .issue import GitHubIssue +from .issue import GitHubIssue, GitHubIssueID from .pull_request import GitHubPullRequest diff --git a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py index 495d3605..82dca6fc 100644 --- a/dags/hivemind_etl_helpers/src/db/github/schema/issue.py +++ b/dags/hivemind_etl_helpers/src/db/github/schema/issue.py @@ -63,3 +63,22 @@ def to_dict(self) -> dict[str, str | int | None]: "repository_name": self.repository_name, "type": "issue", } + + +class GitHubIssueID(GitHubIssue): + def __init__( + self, + id: int, + ) -> None: + self.id = id + + @classmethod + def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID": + return cls( + id=issue["id"], # type: ignore + ) + + def to_dict(self) -> dict[str, str | int | None]: + return { + "id": self.id, + } diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py index ec7d384e..42740337 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py @@ 
-1,11 +1,11 @@ from datetime import datetime -from unittest import TestCase - from pathlib import Path +from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection from hivemind_etl_helpers.src.db.github.extract import GithubExtraction + class TestGithubETLFetchIssueIDs(TestCase): def setUp(self) -> None: self.extractor = GithubExtraction() From 06fa34e12001d76df047472057b39a7aa4a450b8 Mon Sep 17 00:00:00 2001 From: Jacob Date: Tue, 16 Apr 2024 01:27:00 +0200 Subject: [PATCH 12/15] Bounty 2 commit. Tested. Linted. --- .../tests/integration/test_github_etl_fetch_issue_ids.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py index 42740337..ca79a299 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_issue_ids.py @@ -1,5 +1,4 @@ from datetime import datetime -from pathlib import Path from unittest import TestCase from github.neo4j_storage.neo4j_connection import Neo4jConnection From 01fed3d7fbcd7407d8ed0ff1e6227619939f2be3 Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 18 Apr 2024 05:49:18 +0200 Subject: [PATCH 13/15] Bounty 1 (&2) recommended fixes. 
--- dags/hivemind_etl_helpers/github_etl.py | 2 +- .../src/db/github/extract/__init__.py | 4 +--- .../db/github/extract/github_extraction.py | 23 +++++++++++++++++++ .../src/db/github/extract/issues.py | 19 +++++++++------ .../test_github_etl_fetch_raw_issues.py | 6 ++--- 5 files changed, 40 insertions(+), 14 deletions(-) create mode 100644 dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py index 11391679..7112228d 100644 --- a/dags/hivemind_etl_helpers/github_etl.py +++ b/dags/hivemind_etl_helpers/github_etl.py @@ -2,7 +2,7 @@ from datetime import datetime from dotenv import load_dotenv -from hivemind_etl_helpers.src.db.github.extract import ( # fetch_issues, +from hivemind_etl_helpers.src.db.github.extract import ( GithubExtraction, fetch_comments, fetch_commits, diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py index 87a1413d..0c5368e0 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py @@ -4,6 +4,4 @@ from .issues import GithubIssueExtraction from .pull_requests import fetch_pull_requests - -class GithubExtraction(GithubIssueExtraction): - pass +from .github_extraction import GithubExtraction \ No newline at end of file diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py new file mode 100644 index 00000000..df51410c --- /dev/null +++ b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py @@ -0,0 +1,23 @@ +from datetime import datetime + +from hivemind_etl_helpers.src.db.github.extract import GithubIssueExtraction +from hivemind_etl_helpers.src.db.github.schema import GitHubIssue + +class GithubExtraction: + def __init__(self): + # to be uncommented once other pull 
requests + # regarding `extraction` are ready + # self.commits_extraction = GithubCommitExtraction() + # self.pull_requests_extraction = GithubPullRequestsExtraction() + # self.comment_extraction = GitHubCommentExtraction() + self.issue_extraction = GithubIssueExtraction() + + def fetch_issues( + self, + repository_id: list[int], + from_date: datetime | None = None, + **kwargs + ) -> list[GitHubIssue]: + return self.issue_extraction.fetch_issues( + repository_id, from_date, **kwargs + ) \ No newline at end of file diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py index 984f69e0..2a9b2013 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/issues.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/issues.py @@ -7,9 +7,15 @@ class GithubIssueExtraction: def __init__(self): - pass + """ + Initializes the GithubIssueExtraction class + without requiring any parameters. + Establishes a connection to the Neo4j database. 
+ """ + self.neo4j_connection = Neo4jConnection() + self.neo4j_driver = self.neo4j_connection.connect_neo4j() - def __fetch_raw_issues( + def _fetch_raw_issues( self, repository_id: list[int], from_date: datetime | None = None, @@ -30,8 +36,7 @@ def __fetch_raw_issues( raw_records : list[neo4j._data.Record] list of neo4j records as the extracted issues """ - neo4j_connection = Neo4jConnection() - neo4j_driver = neo4j_connection.connect_neo4j() + query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) WHERE i.repository_id IN $repoIds @@ -62,7 +67,7 @@ def _exec_query(tx, repoIds, from_date): result = tx.run(query, repoIds=repoIds, from_date=from_date) return list(result) - with neo4j_driver.session() as session: + with self.neo4j_driver.session() as session: raw_records = session.execute_read( _exec_query, repoIds=repository_id, @@ -92,7 +97,7 @@ def fetch_issues( github_issues : list[GitHubIssue] list of neo4j records as the extracted issues """ - records = self.__fetch_raw_issues(repository_id, from_date) + records = self._fetch_raw_issues(repository_id, from_date) github_issues: list[GitHubIssue] = [] for record in records: @@ -122,7 +127,7 @@ def fetch_issue_ids( github_issues_ids : list[GitHubIssueID] list of neo4j records as the extracted issue ids """ - records = self.__fetch_raw_issues(repository_id, from_date) + records = self._fetch_raw_issues(repository_id, from_date) github_issue_ids: list[GitHubIssueID] = [] for record in records: diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py index 3fc0b1da..473dcf6b 100644 --- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py +++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_raw_issues.py @@ -15,14 +15,14 @@ def setUp(self) -> None: def test_get_empty_results_no_from_date(self): repository_ids = [123, 124] - issues = 
self.extractor._GithubIssueExtraction__fetch_raw_issues( + issues = self.extractor._fetch_raw_issues( repository_id=repository_ids, from_date=None ) self.assertEqual(issues, []) def test_get_empty_results(self): repository_ids = [123, 124] - issues = self.extractor._GithubIssueExtraction__fetch_raw_issues( + issues = self.extractor._fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) self.assertEqual(issues, []) @@ -63,7 +63,7 @@ def test_get_single_issue_single_repo(self): ) repository_ids = [123] - issues = self.extractor._GithubIssueExtraction__fetch_raw_issues( + issues = self.extractor._fetch_raw_issues( repository_id=repository_ids, from_date=datetime(2024, 1, 1) ) From 3dc679dfeb0540b63cc3faade4fbc70d2a0d70b8 Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 18 Apr 2024 13:02:59 +0200 Subject: [PATCH 14/15] Bounty 1 (&2) recommended fixes, + bug fixes --- .../src/db/github/extract/__init__.py | 3 +-- .../db/github/extract/github_extraction.py | 23 +++++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py index 0c5368e0..5ab83746 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/__init__.py @@ -1,7 +1,6 @@ # flake8: noqa from .comments import fetch_comments from .commit import fetch_commits +from .github_extraction import GithubExtraction from .issues import GithubIssueExtraction from .pull_requests import fetch_pull_requests - -from .github_extraction import GithubExtraction \ No newline at end of file diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py index df51410c..79a61047 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py +++ 
b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py @@ -1,8 +1,10 @@ from datetime import datetime +import neo4j from hivemind_etl_helpers.src.db.github.extract import GithubIssueExtraction from hivemind_etl_helpers.src.db.github.schema import GitHubIssue + class GithubExtraction: def __init__(self): # to be uncommented once other pull requests @@ -12,12 +14,19 @@ def __init__(self): # self.comment_extraction = GitHubCommentExtraction() self.issue_extraction = GithubIssueExtraction() + def _fetch_raw_issues( + self, repository_id: list[int], from_date: datetime | None = None, **kwargs + ) -> list[neo4j._data.Record]: + return self.issue_extraction._fetch_raw_issues( + repository_id, from_date, **kwargs + ) + def fetch_issues( - self, - repository_id: list[int], - from_date: datetime | None = None, - **kwargs + self, repository_id: list[int], from_date: datetime | None = None, **kwargs ) -> list[GitHubIssue]: - return self.issue_extraction.fetch_issues( - repository_id, from_date, **kwargs - ) \ No newline at end of file + return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs) + + def fetch_issue_ids( + self, repository_id: list[int], from_date: datetime | None = None, **kwargs + ) -> list[GitHubIssue]: + return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs) From fe72a608bd766fb6957c3be4af9e751e18780100 Mon Sep 17 00:00:00 2001 From: Jacob Date: Thu, 18 Apr 2024 18:33:38 +0200 Subject: [PATCH 15/15] Bounty 1 (&2) recommended fixes, + bug fixes. 
--- .../src/db/github/extract/github_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py index 79a61047..133bdde5 100644 --- a/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py +++ b/dags/hivemind_etl_helpers/src/db/github/extract/github_extraction.py @@ -1,7 +1,7 @@ from datetime import datetime import neo4j -from hivemind_etl_helpers.src.db.github.extract import GithubIssueExtraction +from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction from hivemind_etl_helpers.src.db.github.schema import GitHubIssue