Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functional to OOP refactor for fetch_issues method #105

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions dags/hivemind_etl_helpers/github_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

from dotenv import load_dotenv
from hivemind_etl_helpers.src.db.github.extract import (
GithubExtraction,
fetch_comments,
fetch_commits,
fetch_issues,
fetch_pull_requests,
)
from hivemind_etl_helpers.src.db.github.github_organization_repos import (
Expand Down Expand Up @@ -61,9 +61,13 @@ def process_github_vectorstore(
logging.info(f"{len(repository_ids)} repositories to fetch data from!")

# EXTRACT
extractor = GithubExtraction()

github_comments = fetch_comments(repository_id=repository_ids, from_date=from_date)
github_commits = fetch_commits(repository_id=repository_ids, from_date=from_date)
github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date)
github_issues = extractor.fetch_issues(
repository_id=repository_ids, from_date=from_date
)
github_prs = fetch_pull_requests(
repository_id=repository_ids,
from_date_created=from_starting_date,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# flake8: noqa
from .comments import fetch_comments
from .commit import fetch_commits
from .issues import fetch_issues
from .github_extraction import GithubExtraction
from .pull_requests import fetch_pull_requests
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from datetime import datetime

import neo4j
from hivemind_etl_helpers.src.db.github.extract.issues import GithubIssueExtraction
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID


class GithubExtraction:
    """
    Single entry point for GitHub data extraction.

    Every method delegates to a per-entity extractor object.
    At the moment only the issue extractor is wired in.
    """

    def __init__(self):
        """Create the per-entity extractor(s) this class delegates to."""
        # to be uncommented once other pull requests
        # regarding `extraction` are ready
        # self.commits_extraction = GithubCommitExtraction()
        # self.pull_requests_extraction = GithubPullRequestsExtraction()
        # self.comment_extraction = GitHubCommentExtraction()
        self.issue_extraction = GithubIssueExtraction()

    # NOTE(review): a maintainer suggested removing this wrapper because it
    # would not be used directly. It is kept here so the class interface
    # stays unchanged.
    def _fetch_raw_issues(
        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
    ) -> list[neo4j._data.Record]:
        """
        fetch raw issue records by delegating to the issue extractor

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issues
        from_date : datetime | None
            forwarded to the issue extractor as its date filter
            default is `None`
        **kwargs :
            forwarded as-is to `GithubIssueExtraction._fetch_raw_issues`

        Returns
        --------
        raw_records : list[neo4j._data.Record]
            whatever the issue extractor returned
        """
        return self.issue_extraction._fetch_raw_issues(
            repository_id, from_date, **kwargs
        )

    def fetch_issues(
        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
    ) -> list[GitHubIssue]:
        """
        fetch issues by delegating to the issue extractor

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issues
        from_date : datetime | None
            forwarded to the issue extractor as its date filter
            default is `None`
        **kwargs :
            forwarded as-is to `GithubIssueExtraction.fetch_issues`

        Returns
        --------
        github_issues : list[GitHubIssue]
            whatever the issue extractor returned
        """
        return self.issue_extraction.fetch_issues(repository_id, from_date, **kwargs)

    def _fetch_raw_issue_ids(
        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
    ) -> list[neo4j._data.Record]:
        """
        fetch raw issue-id records by delegating to the issue extractor

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issue ids
        from_date : datetime | None
            forwarded to the issue extractor as its date filter
            default is `None`
        **kwargs :
            forwarded as-is to `GithubIssueExtraction._fetch_raw_issue_ids`

        Returns
        --------
        raw_records : list[neo4j._data.Record]
            whatever the issue extractor returned
        """
        return self.issue_extraction._fetch_raw_issue_ids(
            repository_id, from_date, **kwargs
        )

    def fetch_issue_ids(
        self, repository_id: list[int], from_date: datetime | None = None, **kwargs
    ) -> list[GitHubIssueID]:
        """
        fetch issue ids by delegating to the issue extractor

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issue ids
        from_date : datetime | None
            forwarded to the issue extractor as its date filter
            default is `None`
        **kwargs :
            forwarded as-is to `GithubIssueExtraction.fetch_issue_ids`

        Returns
        --------
        github_issue_ids : list[GitHubIssueID]
            whatever the issue extractor returned
        """
        # The return annotation matches what the delegate is declared to
        # return (`list[GitHubIssueID]`), not full `GitHubIssue` objects.
        return self.issue_extraction.fetch_issue_ids(repository_id, from_date, **kwargs)
277 changes: 184 additions & 93 deletions dags/hivemind_etl_helpers/src/db/github/extract/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,96 +2,187 @@

import neo4j
from github.neo4j_storage.neo4j_connection import Neo4jConnection
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue


def fetch_raw_issues(
    repository_id: list[int],
    from_date: datetime | None = None,
) -> list[neo4j._data.Record]:
    """
    fetch raw issues from data dump in neo4j

    Parameters
    ------------
    repository_id : list[int]
        a list of repository id to fetch their issues
    from_date : datetime | None
        when given, only issues whose `updated_at` is at or after
        this date are returned
        default is `None`, meaning to apply no filtering on data

    Returns
    --------
    raw_records : list[neo4j._data.Record]
        list of neo4j records as the extracted issues,
        ordered by their `created_at`
    """
    match_clause = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
        WHERE
        i.repository_id IN $repoIds
    """
    # the date condition is only part of the query when a date was supplied
    date_clause = (
        ""
        if from_date is None
        else "AND datetime(i.updated_at) >= datetime($from_date)"
    )
    return_clause = """
        MATCH (repo:Repository {id: i.repository_id})
        RETURN
            user.login as author_name,
            i.id as id,
            i.title as title,
            i.body as text,
            i.state as state,
            i.state_reason as state_reason,
            i.created_at as created_at,
            i.updated_at as updated_at,
            i.closed_at as closed_at,
            i.latestSavedAt as latest_saved_at,
            i.html_url as url,
            i.repository_id as repository_id,
            repo.full_name as repository_name
        ORDER BY datetime(created_at)
    """
    cypher = match_clause + date_clause + return_clause

    def _collect(tx, repoIds, from_date):
        # materialise the result inside the transaction function
        return list(tx.run(cypher, repoIds=repoIds, from_date=from_date))

    driver = Neo4jConnection().connect_neo4j()
    with driver.session() as session:
        return session.execute_read(
            _collect,
            repoIds=repository_id,
            from_date=from_date,
        )


def fetch_issues(
    repository_id: list[int],
    from_date: datetime | None = None,
) -> list[GitHubIssue]:
    """
    fetch issues from data dump in neo4j

    Parameters
    ------------
    repository_id : list[int]
        a list of repository id to fetch their issues
    from_date : datetime | None
        passed to `fetch_raw_issues` as its date filter
        default is `None`, meaning to apply no filtering on data

    Returns
    --------
    github_issues : list[GitHubIssue]
        one `GitHubIssue` per raw record, in the order the records
        were returned
    """
    raw_records = fetch_raw_issues(repository_id, from_date)
    return [GitHubIssue.from_dict(raw) for raw in raw_records]
from hivemind_etl_helpers.src.db.github.schema import GitHubIssue, GitHubIssueID


class GithubIssueExtraction:
    """Read GitHub issues, or only their ids, from the neo4j data dump."""

    def __init__(self):
        """
        Initializes the GithubIssueExtraction class
        without requiring any parameters.
        Creates a `Neo4jConnection` and keeps the driver returned by
        its `connect_neo4j()` for later queries.
        """
        self.neo4j_connection = Neo4jConnection()
        self.neo4j_driver = self.neo4j_connection.connect_neo4j()

    def _run_issue_query(
        self,
        return_fields: str,
        repository_id: list[int],
        from_date: datetime | None = None,
    ) -> list[neo4j._data.Record]:
        """
        build and run the issue query shared by the raw fetch methods

        Parameters
        ------------
        return_fields : str
            the comma separated projections placed after `RETURN`
        repository_id : list[int]
            a list of repository id to fetch their issues
        from_date : datetime | None
            when given, only issues whose `updated_at` is at or after
            this date are matched
            default is `None`, meaning to apply no filtering on data

        Returns
        --------
        raw_records : list[neo4j._data.Record]
            the query results, ordered by `created_at`
        """
        query = """MATCH (i:Issue)<-[:CREATED]-(user:GitHubUser)
            WHERE
            i.repository_id IN $repoIds
        """
        if from_date is not None:
            query += "AND datetime(i.updated_at) >= datetime($from_date)"

        query += """
            MATCH (repo:Repository {id: i.repository_id})
            RETURN
        """
        query += return_fields
        query += """
            ORDER BY datetime(created_at)
        """

        def _exec_query(tx, repoIds, from_date):
            # materialise the result inside the transaction function
            result = tx.run(query, repoIds=repoIds, from_date=from_date)
            return list(result)

        with self.neo4j_driver.session() as session:
            raw_records = session.execute_read(
                _exec_query,
                repoIds=repository_id,
                from_date=from_date,
            )

        return raw_records

    def _fetch_raw_issues(
        self,
        repository_id: list[int],
        from_date: datetime | None = None,
    ) -> list[neo4j._data.Record]:
        """
        fetch raw issues from data dump in neo4j

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issues
        from_date : datetime | None
            when given, only issues whose `updated_at` is at or after
            this date are returned
            default is `None`, meaning to apply no filtering on data

        Returns
        --------
        raw_records : list[neo4j._data.Record]
            list of neo4j records as the extracted issues
        """
        issue_fields = """
            user.login as author_name,
            i.id as id,
            i.title as title,
            i.body as text,
            i.state as state,
            i.state_reason as state_reason,
            i.created_at as created_at,
            i.updated_at as updated_at,
            i.closed_at as closed_at,
            i.latestSavedAt as latest_saved_at,
            i.html_url as url,
            i.repository_id as repository_id,
            repo.full_name as repository_name
        """
        return self._run_issue_query(issue_fields, repository_id, from_date)

    def _fetch_raw_issue_ids(
        self,
        repository_id: list[int],
        from_date: datetime | None = None,
    ) -> list[neo4j._data.Record]:
        """
        fetch raw issue ids from data dump in neo4j

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issue ids
        from_date : datetime | None
            when given, only issues whose `updated_at` is at or after
            this date are returned
            default is `None`, meaning to apply no filtering on data

        Returns
        --------
        raw_records : list[neo4j._data.Record]
            list of neo4j records holding `id`, `created_at`
            and `repository_name` of each matched issue
        """
        id_fields = """
            i.id as id,
            i.created_at as created_at,
            repo.full_name as repository_name
        """
        return self._run_issue_query(id_fields, repository_id, from_date)

    def fetch_issues(
        self,
        repository_id: list[int],
        from_date: datetime | None = None,
    ) -> list[GitHubIssue]:
        """
        fetch issues from data dump in neo4j

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issues
        from_date : datetime | None
            when given, only issues whose `updated_at` is at or after
            this date are returned
            default is `None`, meaning to apply no filtering on data

        Returns
        --------
        github_issues : list[GitHubIssue]
            one `GitHubIssue` per extracted neo4j record
        """
        records = self._fetch_raw_issues(repository_id, from_date)
        return [GitHubIssue.from_dict(record) for record in records]

    def fetch_issue_ids(
        self,
        repository_id: list[int],
        from_date: datetime | None = None,
    ) -> list[GitHubIssueID]:
        """
        fetch issue ids from data dump in neo4j

        Parameters
        ------------
        repository_id : list[int]
            a list of repository id to fetch their issue ids
        from_date : datetime | None
            when given, only issues whose `updated_at` is at or after
            this date are returned
            default is `None`, meaning to apply no filtering on data

        Returns
        --------
        github_issues_ids : list[GitHubIssueID]
            one `GitHubIssueID` per extracted neo4j record
        """
        records = self._fetch_raw_issue_ids(repository_id, from_date)
        return [GitHubIssueID.from_dict(record) for record in records]
2 changes: 1 addition & 1 deletion dags/hivemind_etl_helpers/src/db/github/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# flake8: noqa
from .comment import GitHubComment
from .commit import GitHubCommit
from .issue import GitHubIssue
from .issue import GitHubIssue, GitHubIssueID
from .pull_request import GitHubPullRequest
19 changes: 19 additions & 0 deletions dags/hivemind_etl_helpers/src/db/github/schema/issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,22 @@ def to_dict(self) -> dict[str, str | int | None]:
"repository_name": self.repository_name,
"type": "issue",
}


class GitHubIssueID(GitHubIssue):
    """
    Issue representation that carries only the issue id.

    NOTE: `GitHubIssue.__init__` is not called here, so `id` is the only
    attribute this class sets on the instance.
    """

    def __init__(
        self,
        id: int,
    ) -> None:
        """Store the issue id; no other attribute is initialised."""
        self.id = id

    @classmethod
    def from_dict(cls, issue: dict[str, str | int]) -> "GitHubIssueID":
        """Build an instance from a mapping, reading only its `"id"` key."""
        issue_id = issue["id"]
        return cls(id=issue_id)  # type: ignore

    def to_dict(self) -> dict[str, str | int | None]:
        """Return a dict with the single key `"id"`."""
        as_dict: dict[str, str | int | None] = {"id": self.id}
        return as_dict
Loading
Loading