Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bounties/jcwb 00 #139

Open
wants to merge 5 commits into
base: 98-hivemind-refactor-github-issue-extraction-functions-to-object-orientated-code
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@
from .comments import fetch_comments
from .commit import fetch_commits
from .github_extraction import GithubExtraction
from .issues import GithubIssueExtraction
from .pull_requests import fetch_pull_requests
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ def fetch_issues(
) -> list[GitHubIssue]:
return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs)

def _fetch_raw_issue_ids(
    self, repository_id: list[int], from_date: datetime | None = None, **kwargs
) -> list[neo4j._data.Record]:
    """
    delegate the raw issue id extraction to the issue extraction object

    Parameters
    ------------
    repository_id : list[int]
        a list of repository id to fetch their issue ids
    from_date : datetime | None
        passed through unchanged to the issue extraction object
        default is `None`
    **kwargs :
        any extra keyword arguments, passed through unchanged

    Returns
    --------
    raw_records : list[neo4j._data.Record]
        whatever the issue extraction object returns for the same arguments
    """
    issue_extractor = self.issue_extraction
    raw_records = issue_extractor._fetch_raw_issue_ids(
        repository_id,
        from_date,
        **kwargs,
    )
    return raw_records

def fetch_issue_ids(
self, repository_id: list[int], from_date: datetime | None = None, **kwargs
) -> list[GitHubIssue]:
Expand Down
53 changes: 52 additions & 1 deletion dags/hivemind_etl_helpers/src/db/github/extract/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,57 @@ def _exec_query(tx, repoIds, from_date):

return raw_records

def _fetch_raw_issue_ids(
    self,
    repository_id: list[int],
    from_date: datetime | None = None,
    **kwargs,
) -> list[neo4j._data.Record]:
    """
    fetch raw issue ids from data dump in neo4j

    Parameters
    ------------
    repository_id : list[int]
        a list of repository id to fetch their issue ids
    from_date : datetime | None
        only return the issues whose `updated_at` is on or after this date
        default is `None`, meaning to apply no filtering on data
    **kwargs :
        extra keyword arguments are accepted so that callers forwarding
        their own `**kwargs` do not fail with a `TypeError`;
        none of them is used by the query

    Returns
    --------
    raw_records : list[neo4j._data.Record]
        list of neo4j records, each one holding the issue `id`,
        its `created_at` and the `repository_name`,
        ordered by the issue creation date
    """

    query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
        WHERE
        i.repository_id IN $repoIds
    """
    # the date filter is on the last update time, not on the creation time
    if from_date is not None:
        query += "AND datetime(i.updated_at) >= datetime($from_date)"

    query += """
        MATCH (repo:Repository {id: i.repository_id})
        RETURN
            i.id as id,
            i.created_at as created_at,
            repo.full_name as repository_name
        ORDER BY datetime(created_at)
    """

    def _exec_query(tx, repoIds, from_date):
        # materialize the result inside the transaction function,
        # the records are returned to the caller after the session is closed
        result = tx.run(query, repoIds=repoIds, from_date=from_date)
        return list(result)

    with self.neo4j_driver.session() as session:
        raw_records = session.execute_read(
            _exec_query,
            repoIds=repository_id,
            from_date=from_date,
        )

    return raw_records

def fetch_issues(
self,
repository_id: list[int],
Expand Down Expand Up @@ -127,7 +178,7 @@ def fetch_issue_ids(
github_issues_ids : list[GitHubIssueID]
list of neo4j records as the extracted issue ids
"""
records = self._fetch_raw_issues(repository_id, from_date)
records = self._fetch_raw_issue_ids(repository_id, from_date)

github_issue_ids: list[GitHubIssueID] = []
for record in records:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from datetime import datetime
from unittest import TestCase

from github.neo4j_storage.neo4j_connection import Neo4jConnection
from hivemind_etl_helpers.src.db.github.extract import GithubExtraction


class TestGithubETLFetchRawIssues(TestCase):
    """
    integration tests for `GithubExtraction._fetch_raw_issue_ids`

    every test runs against the neo4j instance that `Neo4jConnection`
    connects to, and the whole database is wiped before each test
    """

    def setUp(self) -> None:
        self.extractor = GithubExtraction()
        neo4j_connection = Neo4jConnection()
        self.neo4j_driver = neo4j_connection.connect_neo4j()
        # start each test from an empty database
        with self.neo4j_driver.session() as session:
            session.execute_write(lambda tx: tx.run("MATCH (n) DETACH DELETE (n)"))

    def test_get_empty_results_no_from_date(self):
        """no data in the database and no date filter gives an empty list"""
        repository_ids = [123, 124]
        issues = self.extractor._fetch_raw_issue_ids(
            repository_id=repository_ids, from_date=None
        )
        self.assertEqual(issues, [])

    def test_get_empty_results(self):
        """no data in the database with a date filter gives an empty list"""
        repository_ids = [123, 124]
        issues = self.extractor._fetch_raw_issue_ids(
            repository_id=repository_ids, from_date=datetime(2024, 1, 1)
        )
        self.assertEqual(issues, [])

    def test_get_single_issue_single_repo_minimum_info(self):
        """one issue with only a few properties set is returned as one record"""
        with self.neo4j_driver.session() as session:
            session.execute_write(
                lambda tx: tx.run(
                    """
                    CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"})
                    SET
                        i.latestSavedAt = "2024-02-15T06:10:02.262000000Z",
                        i.comments = 0,
                        i.created_at = "2024-02-06T10:23:50Z",
                        i.number = 1,
                        i.updated_at = "2024-02-06T12:56:05Z",
                        i.repository_id = 123,
                        i.id = 21200001,
                        i.node_id = "some_id"

                    CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"})
                    """
                )
            )

        repository_ids = [123]
        # the method under test is `_fetch_raw_issue_ids`,
        # not the full-issue `_fetch_raw_issues`
        issues = self.extractor._fetch_raw_issue_ids(
            repository_id=repository_ids, from_date=datetime(2024, 1, 1)
        )

        self.assertEqual(len(issues), 1)
        self.assertEqual(issues[0]["id"], 21200001)
        self.assertEqual(issues[0]["created_at"], "2024-02-06T10:23:50Z")
        self.assertEqual(issues[0]["repository_name"], "Org/SampleRepo")

    def test_get_single_issue_single_repo_complete_info(self):
        """one issue with all its properties set is returned as one record"""
        with self.neo4j_driver.session() as session:
            session.execute_write(
                lambda tx: tx.run(
                    """
                    CREATE (i:Issue)<-[:CREATED]-(:GitHubUser {login: "author #1"})
                    SET
                        i.state_reason = "completed",
                        i.body = "explanation of some sample issue",
                        i.latestSavedAt = "2024-02-15T06:10:02.262000000Z",
                        i.closed_at = "2024-02-06T12:56:05Z",
                        i.comments = 0,
                        i.created_at = "2024-02-06T10:23:50Z",
                        i.title = "some sample title",
                        i.url = "https://api.github.com/repos/GitHub/some_repo/issues/1",
                        i.author_association = "CONTRIBUTOR",
                        i.labels_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/labels{/name}",
                        i.number = 1,
                        i.updated_at = "2024-02-06T12:56:05Z",
                        i.events_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/events",
                        i.html_url = "https://github.com/GitHub/some_repo/issues/1",
                        i.comments_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/comments",
                        i.repository_id = 123,
                        i.id = 21200001,
                        i.repository_url = "https://api.github.com/repos/GitHub/some_repo",
                        i.state = "closed",
                        i.locked = false,
                        i.timeline_url = "https://api.github.com/repos/GitHub/some_repo/issues/1/timeline",
                        i.node_id = "some_id"

                    CREATE (repo:Repository {id: 123, full_name: "Org/SampleRepo"})
                    """
                )
            )

        repository_ids = [123]
        # the method under test is `_fetch_raw_issue_ids`,
        # not the full-issue `_fetch_raw_issues`
        issues = self.extractor._fetch_raw_issue_ids(
            repository_id=repository_ids, from_date=datetime(2024, 1, 1)
        )

        self.assertEqual(len(issues), 1)
        self.assertEqual(issues[0]["id"], 21200001)
        self.assertEqual(issues[0]["created_at"], "2024-02-06T10:23:50Z")
        self.assertEqual(issues[0]["repository_name"], "Org/SampleRepo")
Loading