Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

98 hivemind refactor GitHub issue extraction functions to object orientated code #107

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions dags/hivemind_etl_helpers/github_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

from dotenv import load_dotenv
from hivemind_etl_helpers.src.db.github.extract import (
GithubExtraction,
fetch_comments,
fetch_commits,
fetch_issues,
fetch_pull_requests,
)
from hivemind_etl_helpers.src.db.github.github_organization_repos import (
Expand Down Expand Up @@ -61,9 +61,13 @@ def process_github_vectorstore(
logging.info(f"{len(repository_ids)} repositories to fetch data from!")

# EXTRACT
extractor = GithubExtraction()

github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date)
github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date)
github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date)
github_issues = extractor.fetch_issues(
repository_id=repository_ids, from_date=from_date
)
github_prs = fetch_pull_requests(
repository_id=repository_ids,
from_date_created=from_starting_date,
Expand Down
3 changes: 2 additions & 1 deletion dags/hivemind_etl_helpers/src/db/github/extract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# flake8: noqa
from .comments import fetch_comments
from .commit import fetch_commits
from .issues import fetch_issues
from .github_extraction import GithubExtraction
from .issues import GithubIssueExtraction
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please consider removing line 5 (`from .issues import GithubIssueExtraction`): it is not always needed, because we always go through `GithubExtraction`.

If we ever need `GithubIssueExtraction` directly, we can import it from its own file rather than from this package.

from .pull_requests import fetch_pull_requests
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from datetime import datetime

import neo4j
from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID


class GithubExtraction:
def __init__(self):
    """
    Create the per-entity extractors this class delegates to.

    Only the issue extractor is instantiated for now.
    """
    # TODO: enable the remaining extractors once the other pull requests
    # regarding `extraction` are ready:
    # self.commits_extraction = GithubCommitExtraction()
    # self.pull_requests_extraction = GithubPullRequestsExtraction()
    # self.comment_extraction = GitHubCommentExtraction()
    issue_extractor = GithubIssueExtraction()
    self.issue_extraction = issue_extractor

def _fetch_raw_issues(
Copy link
Member

@amindadgar amindadgar Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there any specific reason we included the _fetch_raw_issues within GitHubExtraction?

I think having the fetch_issues would be enough in our GitHubExtraction

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, the _raw methods are still called from GitHubExtraction inside the test cases.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yes, I see your point. To reach `_fetch_raw_issues` from the test cases, you can call it through the extractor's `issue_extraction` attribute, as follows:

self.extractor.issue_extraction._fetch_raw_issues

self, repository_id: list[int], from_date: datetime | None = None, **kwargs
) -> list[neo4j._data.Record]:
return self.issue_extraction._fetch_raw_issues(
repository_id, from_date, **kwargs
)

def fetch_issues(
self, repository_id: list[int], from_date: datetime | None = None, **kwargs
) -> list[GitHubIssue]:
return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs)

def fetch_issue_ids(
self, repository_id: list[int], from_date: datetime | None = None, **kwargs
) -> list[GitHubIssue]:
return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs)
226 changes: 133 additions & 93 deletions dags/hivemind_etl_helpers/src/db/github/extract/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,96 +2,136 @@

import neo4j
from github.neo4j_storage.neo4j_connection import Neo4jConnection
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue


def fetch_raw_issues(
    repository_id: list[int],
    from_date: datetime | None = None,
) -> list[neo4j._data.Record]:
    """
    read the raw issue records of the given repositories from neo4j

    Parameters
    ------------
    repository_id : list[int]
        ids of the repositories whose issues should be returned
    from_date : datetime | None
        when given, only issues whose `updated_at` is at or after
        this date are returned
        default is `None`, meaning no date filtering is applied

    Returns
    --------
    raw_records : list[neo4j._data.Record]
        the query results, ordered by the issues' `created_at`
    """
    match_clause = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
WHERE
i.repository_id IN $repoIds
"""
    # the date condition is optional and extends the WHERE clause above
    date_clause = (
        "AND datetime(i.updated_at) >= datetime($from_date)"
        if from_date is not None
        else ""
    )
    return_clause = """
MATCH (repo:Repository {id: i.repository_id})
RETURN
user.login as author_name,
i.id as id,
i.title as title,
i.body as text,
i.state as state,
i.state_reason as state_reason,
i.created_at as created_at,
i.updated_at as updated_at,
i.closed_at as closed_at,
i.latestSavedAt as latest_saved_at,
i.html_url as url,
i.repository_id as repository_id,
repo.full_name as repository_name
ORDER BY datetime(created_at)
"""
    cypher = match_clause + date_clause + return_clause

    def _read_all(tx, repoIds, from_date):
        # collect every record into a list before handing it back
        return list(tx.run(cypher, repoIds=repoIds, from_date=from_date))

    driver = Neo4jConnection().connect_neo4j()
    with driver.session() as session:
        return session.execute_read(
            _read_all,
            repoIds=repository_id,
            from_date=from_date,
        )


def fetch_issues(
    repository_id: list[int],
    from_date: datetime | None = None,
) -> list[GitHubIssue]:
    """
    fetch the issues of the given repositories as `GitHubIssue` objects

    Parameters
    ------------
    repository_id : list[int]
        ids of the repositories whose issues should be returned
    from_date : datetime | None
        passed on to `fetch_raw_issues` as its date filter
        default is `None`, meaning no date filtering is applied

    Returns
    --------
    github_issues : list[GitHubIssue]
        one `GitHubIssue` per raw record, in the order the records
        were returned
    """
    raw_records = fetch_raw_issues(repository_id, from_date)
    return [GitHubIssue.from_dict(raw) for raw in raw_records]
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID


class GithubIssueExtraction:
def __init__(self):
    """
    Initializes the GithubIssueExtraction class
    without requiring any parameters.
    Creates a `Neo4jConnection` and keeps both the connection object
    and the driver returned by its `connect_neo4j()` on the instance.
    """
    self.neo4j_connection = Neo4jConnection()
    self.neo4j_driver = self.neo4j_connection.connect_neo4j()

def _fetch_raw_issues(
self,
repository_id: list[int],
from_date: datetime | None = None,
) -> list[neo4j._data.Record]:
"""
fetch raw issues from data dump in neo4j

Parameters
------------
repository_id : list[int]
a list of repository id to fetch their issues
from_date : datetime | None
get the issues form a specific date that they were created
default is `None`, meaning to apply no filtering on data

Returns
--------
raw_records : list[neo4j._data.Record]
list of neo4j records as the extracted issues
"""

query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
WHERE
i.repository_id IN $repoIds
"""
if from_date is not None:
query += "AND datetime(i.updated_at) >= datetime($from_date)"

query += """
MATCH (repo:Repository {id: i.repository_id})
RETURN
user.login as author_name,
i.id as id,
i.title as title,
i.body as text,
i.state as state,
i.state_reason as state_reason,
i.created_at as created_at,
i.updated_at as updated_at,
i.closed_at as closed_at,
i.latestSavedAt as latest_saved_at,
i.html_url as url,
i.repository_id as repository_id,
repo.full_name as repository_name
ORDER BY datetime(created_at)
"""

def _exec_query(tx, repoIds, from_date):
result = tx.run(query, repoIds=repoIds, from_date=from_date)
return list(result)

with self.neo4j_driver.session() as session:
raw_records = session.execute_read(
_exec_query,
repoIds=repository_id,
from_date=from_date,
)

return raw_records

def fetch_issues(
self,
repository_id: list[int],
from_date: datetime | None = None,
) -> list[GitHubIssue]:
"""
fetch issues from data dump in neo4j

Parameters
------------
repository_id : list[int]
a list of repository id to fetch their issues
from_date : datetime | None
get the issues form a specific date that they were created
default is `None`, meaning to apply no filtering on data

Returns
--------
github_issues : list[GitHubIssue]
list of neo4j records as the extracted issues
"""
records = self._fetch_raw_issues(repository_id, from_date)

github_issues: list[GitHubIssue] = []
for record in records:
issue = GitHubIssue.from_dict(record)
github_issues.append(issue)

return github_issues

def fetch_issue_ids(
amindadgar marked this conversation as resolved.
Show resolved Hide resolved
self,
repository_id: list[int],
from_date: datetime | None = None,
) -> list[GitHubIssueID]:
"""
fetch issues from data dump in neo4j

Parameters
------------
repository_id : list[int]
a list of repository id to fetch their issues
from_date : datetime | None
get the issues form a specific date that they were created
default is `None`, meaning to apply no filtering on data

Returns
--------
github_issues_ids : list[GitHubIssueID]
list of neo4j records as the extracted issue ids
"""
records = self._fetch_raw_issues(repository_id, from_date)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you write another function for just fetching the ids? Fetching the whole data and just using their ids is inefficient. You can name the method `_fetch_raw_issue_ids`. Please let me know if you need any help with the neo4j query.


github_issue_ids: list[GitHubIssueID] = []
for record in records:
issue = GitHubIssueID.from_dict(record)
github_issue_ids.append(issue)

return github_issue_ids
Comment on lines +8 to +137
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tip

Codebase Verification

The methods _fetch_raw_issues, fetch_issues, and fetch_issue_ids from the GithubIssueExtraction class are actively used within the codebase. However, the method _fetch_raw_issues is accessed from outside its own class (GithubIssueExtraction), specifically in github_extraction.py. This is not typical for a method prefixed with an underscore, which conventionally indicates it should be a private method. This might suggest a need to reconsider the method's visibility (i.e., renaming it to remove the underscore if it's intended to be used externally).

  • issues.py and github_extraction.py both use _fetch_raw_issues.
  • fetch_issues and fetch_issue_ids are used within their own class and also in github_extraction.py.
Analysis chain

The GithubIssueExtraction class is well-implemented, encapsulating the logic for fetching issues and issue IDs. Ensure that the methods _fetch_raw_issues, fetch_issues, and fetch_issue_ids are used correctly and tested thoroughly.

Scripts Executed

The following scripts were executed for the analysis:


Script:

#!/bin/bash
# Verify the usage of `_fetch_raw_issues`, `fetch_issues`, and `fetch_issue_ids` methods in the codebase.
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "_fetch_raw_issues"
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "fetch_issues"
rg --files-with-matches --type py "GithubIssueExtraction" | xargs rg "fetch_issue_ids"

Length of output: 1470

2 changes: 1 addition & 1 deletion dags/hivemind_etl_helpers/src/db/github/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# flake8: noqa
from .comment import GitHubComment
from .commit import GitHubCommit
from .issue import GitHubIssue
from .issue import GitHubIssue, GitHubIssueID
amindadgar marked this conversation as resolved.
Show resolved Hide resolved
from .pull_request import GitHubPullRequest
19 changes: 19 additions & 0 deletions dags/hivemind_etl_helpers/src/db/github/schema/issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,22 @@ def to_dict(self) -> dict[str, str | int | None]:
"repository_name": self.repository_name,
"type": "issue",
}


class GitHubIssueID(GitHubIssue):
    """
    Slimmed-down issue that carries only the issue id.

    NOTE(review): `GitHubIssue.__init__` is not called here, so `id` is
    the only attribute this initialiser sets; confirm that no inherited
    method reading other attributes is used on instances of this class.
    """

    def __init__(
        self,
        id: int,
    ) -> None:
        self.id = id

    @classmethod
    def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID":
        """Build an instance from a mapping that holds an `id` entry."""
        issue_id = issue["id"]
        return cls(id=issue_id)  # type: ignore

    def to_dict(self) -> dict[str, str | int | None]:
        """Return the id wrapped in a single-key dictionary."""
        as_dict: dict[str, str | int | None] = {"id": self.id}
        return as_dict
Loading
Loading