-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
98 hivemind refactor GitHub issue extraction functions to object orientated code #107
base: main
Are you sure you want to change the base?
Changes from all commits
be0fb2d
fe6884d
9ee4747
6b12419
549699b
3278cd5
4d2f753
ef4da20
db159cc
4d6e3f6
1cbbf69
06fa34e
01fed3d
3dc679d
fe72a60
6efc3f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
# flake8: noqa | ||
from .comments import fetch_comments | ||
from .commit import fetch_commits | ||
from .issues import fetch_issues | ||
from .github_extraction import GithubExtraction | ||
from .issues import GithubIssueExtraction | ||
from .pull_requests import fetch_pull_requests |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from datetime import datetime | ||
|
||
import neo4j | ||
from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction | ||
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue | ||
|
||
|
||
class GithubExtraction: | ||
"""Entry point that forwards issue queries to a `GithubIssueExtraction` instance.""" | ||
def __init__(self): | ||
"""Create the issue extractor that the methods of this class delegate to.""" | ||
# to be uncommented once other pull requests | ||
# regarding `extraction` are ready | ||
# self.commits_extraction = GithubCommitExtraction() | ||
# self.pull_requests_extraction = GithubPullRequestsExtraction() | ||
# self.comment_extraction = GitHubCommentExtraction() | ||
self.issue_extraction = GithubIssueExtraction() | ||
|
||
def _fetch_raw_issues( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was there any specific reason we included the I think having the fetch_issues would be enough in our There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently, the _raw methods are still called from GitHubExtraction inside the test cases. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, yes I see your point. To access that in test cases you can call the self.extractor.issue_extraction._fetch_raw_issues |
||
self, repository_id: list[int], from_date: datetime | None = None, **kwargs | ||
) -> list[neo4j._data.Record]: | ||
"""Return the raw neo4j records from `GithubIssueExtraction._fetch_raw_issues`.""" | ||
# NOTE(review): `**kwargs` is forwarded, but the delegate method shown in | ||
# this change set accepts only `repository_id` and `from_date`, so any | ||
# extra keyword argument would raise a TypeError there. | ||
return self.issue_extraction._fetch_raw_issues( | ||
repository_id, from_date, **kwargs | ||
) | ||
|
||
def fetch_issues( | ||
self, repository_id: list[int], from_date: datetime | None = None, **kwargs | ||
) -> list[GitHubIssue]: | ||
"""Return the `GitHubIssue` objects from `GithubIssueExtraction.fetch_issues`.""" | ||
return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs) | ||
|
||
def fetch_issue_ids( | ||
self, repository_id: list[int], from_date: datetime | None = None, **kwargs | ||
) -> list[GitHubIssue]: | ||
"""Return the result of `GithubIssueExtraction.fetch_issue_ids`.""" | ||
# NOTE(review): the delegate is annotated as returning `list[GitHubIssueID]`, | ||
# while this wrapper is annotated `list[GitHubIssue]`; `GitHubIssueID` is not | ||
# imported in this module, so the annotation is left untouched here. | ||
return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,96 +2,136 @@ | |
|
||
import neo4j | ||
from github.neo4j_storage.neo4j_connection import Neo4jConnection | ||
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue | ||
|
||
|
||
def fetch_raw_issues( | ||
repository_id: list[int], | ||
from_date: datetime | None = None, | ||
) -> list[neo4j._data.Record]: | ||
""" | ||
fetch raw issues from data dump in neo4j | ||
|
||
Parameters | ||
------------ | ||
repository_id : list[int] | ||
a list of repository ids whose issues should be fetched | ||
from_date : datetime | None | ||
only return issues whose `updated_at` is on or after this date | ||
default is `None`, meaning to apply no date filtering on data | ||
|
||
Returns | ||
-------- | ||
raw_records : list[neo4j._data.Record] | ||
list of neo4j records as the extracted issues, | ||
ordered by their `created_at` value | ||
""" | ||
# a connection object and driver are obtained on every call | ||
neo4j_connection = Neo4jConnection() | ||
neo4j_driver = neo4j_connection.connect_neo4j() | ||
query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) | ||
WHERE | ||
i.repository_id IN $repoIds | ||
""" | ||
# the date condition is appended only when a date was given | ||
if from_date is not None: | ||
query += "AND datetime(i.updated_at) >= datetime($from_date)" | ||
|
||
query += """ | ||
MATCH (repo:Repository {id: i.repository_id}) | ||
RETURN | ||
user.login as author_name, | ||
i.id as id, | ||
i.title as title, | ||
i.body as text, | ||
i.state as state, | ||
i.state_reason as state_reason, | ||
i.created_at as created_at, | ||
i.updated_at as updated_at, | ||
i.closed_at as closed_at, | ||
i.latestSavedAt as latest_saved_at, | ||
i.html_url as url, | ||
i.repository_id as repository_id, | ||
repo.full_name as repository_name | ||
ORDER BY datetime(created_at) | ||
""" | ||
|
||
def _exec_query(tx, repoIds, from_date): | ||
# the result is converted to a list inside the transaction function | ||
result = tx.run(query, repoIds=repoIds, from_date=from_date) | ||
return list(result) | ||
|
||
with neo4j_driver.session() as session: | ||
raw_records = session.execute_read( | ||
_exec_query, | ||
repoIds=repository_id, | ||
from_date=from_date, | ||
) | ||
|
||
return raw_records | ||
|
||
|
||
def fetch_issues( | ||
repository_id: list[int], | ||
from_date: datetime | None = None, | ||
) -> list[GitHubIssue]: | ||
""" | ||
fetch issues from data dump in neo4j | ||
|
||
Parameters | ||
------------ | ||
repository_id : list[int] | ||
a list of repository ids whose issues should be fetched | ||
from_date : datetime | None | ||
passed unchanged to `fetch_raw_issues` as its date filter | ||
default is `None`, meaning to apply no date filtering on data | ||
|
||
Returns | ||
-------- | ||
github_issues : list[GitHubIssue] | ||
one `GitHubIssue` per fetched record, built with `GitHubIssue.from_dict` | ||
""" | ||
records = fetch_raw_issues(repository_id, from_date) | ||
|
||
github_issues: list[GitHubIssue] = [] | ||
for record in records: | ||
issue = GitHubIssue.from_dict(record) | ||
github_issues.append(issue) | ||
|
||
return github_issues | ||
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID | ||
|
||
|
||
class GithubIssueExtraction: | ||
"""Reads GitHub issues from neo4j and converts them to schema objects.""" | ||
def __init__(self): | ||
""" | ||
Initializes the GithubIssueExtraction class | ||
without requiring any parameters. | ||
Creates a `Neo4jConnection` and keeps the driver | ||
returned by its `connect_neo4j()` method. | ||
""" | ||
self.neo4j_connection = Neo4jConnection() | ||
self.neo4j_driver = self.neo4j_connection.connect_neo4j() | ||
|
||
def _fetch_raw_issues( | ||
self, | ||
repository_id: list[int], | ||
from_date: datetime | None = None, | ||
) -> list[neo4j._data.Record]: | ||
""" | ||
fetch raw issues from data dump in neo4j | ||
|
||
Parameters | ||
------------ | ||
repository_id : list[int] | ||
a list of repository ids whose issues should be fetched | ||
from_date : datetime | None | ||
only return issues whose `updated_at` is on or after this date | ||
default is `None`, meaning to apply no date filtering on data | ||
|
||
Returns | ||
-------- | ||
raw_records : list[neo4j._data.Record] | ||
list of neo4j records as the extracted issues, | ||
ordered by their `created_at` value | ||
""" | ||
|
||
query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser) | ||
WHERE | ||
i.repository_id IN $repoIds | ||
""" | ||
# the date condition is appended only when a date was given | ||
if from_date is not None: | ||
query += "AND datetime(i.updated_at) >= datetime($from_date)" | ||
|
||
query += """ | ||
MATCH (repo:Repository {id: i.repository_id}) | ||
RETURN | ||
user.login as author_name, | ||
i.id as id, | ||
i.title as title, | ||
i.body as text, | ||
i.state as state, | ||
i.state_reason as state_reason, | ||
i.created_at as created_at, | ||
i.updated_at as updated_at, | ||
i.closed_at as closed_at, | ||
i.latestSavedAt as latest_saved_at, | ||
i.html_url as url, | ||
i.repository_id as repository_id, | ||
repo.full_name as repository_name | ||
ORDER BY datetime(created_at) | ||
""" | ||
|
||
def _exec_query(tx, repoIds, from_date): | ||
# the result is converted to a list inside the transaction function | ||
result = tx.run(query, repoIds=repoIds, from_date=from_date) | ||
return list(result) | ||
|
||
with self.neo4j_driver.session() as session: | ||
raw_records = session.execute_read( | ||
_exec_query, | ||
repoIds=repository_id, | ||
from_date=from_date, | ||
) | ||
|
||
return raw_records | ||
|
||
def fetch_issues( | ||
self, | ||
repository_id: list[int], | ||
from_date: datetime | None = None, | ||
) -> list[GitHubIssue]: | ||
""" | ||
fetch issues from data dump in neo4j | ||
|
||
Parameters | ||
------------ | ||
repository_id : list[int] | ||
a list of repository ids whose issues should be fetched | ||
from_date : datetime | None | ||
passed unchanged to `_fetch_raw_issues` as its date filter | ||
default is `None`, meaning to apply no date filtering on data | ||
|
||
Returns | ||
-------- | ||
github_issues : list[GitHubIssue] | ||
one `GitHubIssue` per fetched record, built with `GitHubIssue.from_dict` | ||
""" | ||
records = self._fetch_raw_issues(repository_id, from_date) | ||
|
||
github_issues: list[GitHubIssue] = [] | ||
for record in records: | ||
issue = GitHubIssue.from_dict(record) | ||
github_issues.append(issue) | ||
|
||
return github_issues | ||
|
||
def fetch_issue_ids( | ||
amindadgar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self, | ||
repository_id: list[int], | ||
from_date: datetime | None = None, | ||
) -> list[GitHubIssueID]: | ||
""" | ||
fetch issue ids from data dump in neo4j | ||
|
||
Parameters | ||
------------ | ||
repository_id : list[int] | ||
a list of repository ids whose issue ids should be fetched | ||
from_date : datetime | None | ||
passed unchanged to `_fetch_raw_issues` as its date filter | ||
default is `None`, meaning to apply no date filtering on data | ||
|
||
Returns | ||
-------- | ||
github_issues_ids : list[GitHubIssueID] | ||
one `GitHubIssueID` per fetched record, built with `GitHubIssueID.from_dict` | ||
""" | ||
# the full issue records are fetched here, then reduced to `GitHubIssueID` | ||
records = self._fetch_raw_issues(repository_id, from_date) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you write another function for just fetching the ids? Fetching the whole data and just using their ids is inefficient. You can name the method as |
||
|
||
github_issue_ids: list[GitHubIssueID] = [] | ||
for record in records: | ||
issue = GitHubIssueID.from_dict(record) | ||
github_issue_ids.append(issue) | ||
|
||
return github_issue_ids | ||
Comment on lines
+8
to
+137
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tip Codebase Verification The methods
Analysis chain. Scripts executed: the following scripts were executed for the analysis. Script: #!/bin/bash
# Verify the usage of `_fetch_raw_issues`, `fetch_issues`, and `fetch_issue_ids` methods in the codebase.
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "_fetch_raw_issues"
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "fetch_issues"
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "fetch_issue_ids"
Length of output: 1470 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# flake8: noqa | ||
from .comment import GitHubComment | ||
from .commit import GitHubCommit | ||
from .issue import GitHubIssue | ||
from .issue import GitHubIssue, GitHubIssueID | ||
amindadgar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from .pull_request import GitHubPullRequest |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please consider removing line 5 (`from .issues import GithubIssueExtraction`), because it is not always needed and we're always using GitHubExtraction. If we wanted to use GitHubIssueExtraction, we could import it directly from its own file rather than from this module directory.