Merge pull request #347 from HebaruSan/fix/disable-ia-dl-counts
Disable archive.org queries after timeout
HebaruSan authored Oct 20, 2024
2 parents 6b9e530 + 182094c commit 1179585
Showing 1 changed file with 33 additions and 34 deletions.
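
The diff below moves the ConnectTimeout handling out of InternetArchiveBatchedQuery.get_result() and into DownloadCounter.get_counts(): the query variable becomes Optional, and the first timeout replaces it with None so every remaining archive.org batch is skipped instead of timing out again. A minimal sketch of that pattern follows; ArchiveOrgBatch, count_archive_org, and the batch-size threshold are hypothetical stand-ins, not the real netkan classes.

import logging
from typing import Dict, Optional

from requests.exceptions import ConnectTimeout


class ArchiveOrgBatch:
    """Hypothetical stand-in for InternetArchiveBatchedQuery."""

    def __init__(self) -> None:
        self.ids: Dict[str, str] = {}

    def empty(self) -> bool:
        return len(self.ids) == 0

    def full(self) -> bool:
        # Illustrative threshold; the real class's batch size is not shown in this diff
        return len(self.ids) >= 100

    def add(self, ckan_ident: str, ia_ident: str) -> None:
        self.ids[ckan_ident] = ia_ident

    def get_result(self, counts: Dict[str, int]) -> Dict[str, int]:
        # The real class issues one requests.get() for the whole batch;
        # that network call is what can raise ConnectTimeout.
        ...
        return counts


def count_archive_org(modules: Dict[str, str]) -> Dict[str, int]:
    counts: Dict[str, int] = {}
    ia_query: Optional[ArchiveOrgBatch] = ArchiveOrgBatch()
    for ckan_ident, ia_ident in modules.items():
        if ia_query:                              # skipped entirely once disabled
            ia_query.add(ckan_ident, ia_ident)
            if ia_query.full():
                try:
                    ia_query.get_result(counts)
                    ia_query = ArchiveOrgBatch()  # start the next batch
                except ConnectTimeout as exc:
                    # Cleanly turn off archive.org counting while the downtime continues
                    logging.error('Failed to get counts from archive.org',
                                  exc_info=exc)
                    ia_query = None               # disables all later batches
    if ia_query and not ia_query.empty():
        ia_query.get_result(counts)               # final partial batch
    return counts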
67 changes: 33 additions & 34 deletions netkan/netkan/download_counter.py
@@ -16,7 +16,9 @@
 from .metadata import Ckan
 
 
-class GraphQLQuery:
+class GitHubBatchedQuery:
+
+    PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')
 
     # The URL that handles GitHub GraphQL requests
     GITHUB_API = 'https://api.github.com/graphql'
@@ -137,6 +139,8 @@ def sum_graphql_result(self, apidata: Dict[str, Any]) -> int:
 
 class SpaceDockBatchedQuery:
 
+    PATH_PATTERN = re.compile(r'^/mod/([^/]+)')
+
     SPACEDOCK_API = 'https://spacedock.info/api/download_counts'
 
     def __init__(self) -> None:
@@ -184,7 +188,6 @@ class InternetArchiveBatchedQuery:
 
     def __init__(self) -> None:
         self.ids: Dict[str, str] = {}
-        self.connect_timed_out = False
 
     def empty(self) -> bool:
         return len(self.ids) == 0
@@ -198,28 +201,21 @@ def add(self, ckan: Ckan) -> None:
     def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
         if counts is None:
             counts = {}
-        if self.connect_timed_out:
-            return counts
-        try:
-            result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
-                                  timeout=60).json()
-            for ckan_ident, ia_ident in self.ids.items():
-                try:
-                    counts[ckan_ident] = counts.get(ckan_ident, 0) + result[ia_ident]['all_time']
-                except KeyError as exc:
-                    logging.error('InternetArchive id not found in downloads result: %s',
-                                  ia_ident, exc_info=exc)
-            return counts
-        except ConnectTimeout as exc:
-            # Cleanly turn off archive.org counting while the downtime continues
-            logging.error('Failed to get counts from archive.org',
-                          exc_info=exc)
-            self.connect_timed_out = True
-            return counts
+        result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
+                              timeout=60).json()
+        for ckan_ident, ia_ident in self.ids.items():
+            try:
+                counts[ckan_ident] = counts.get(ckan_ident, 0) + result[ia_ident]['all_time']
+            except KeyError as exc:
+                logging.error('InternetArchive id not found in downloads result: %s',
+                              ia_ident, exc_info=exc)
+        return counts
 
 
 class SourceForgeQuerier:
 
+    PATH_PATTERN = re.compile(r'^/project/([^/]+)')
+
     # https://sourceforge.net/p/forge/documentation/Download%20Stats%20API/
     API_TEMPLATE = Template('https://sourceforge.net/projects/${proj_id}/files/stats/json'
                             '?start_date=2010-01-01&end_date=${today}'
@@ -245,10 +241,6 @@ def get_result(cls, ident: str, proj_id: str,
 
 class DownloadCounter:
 
-    GITHUB_PATH_PATTERN = re.compile(r'^/([^/]+)/([^/]+)')
-    SPACEDOCK_PATH_PATTERN = re.compile(r'^/mod/([^/]+)')
-    SOURCEFORGE_PATH_PATTERN = re.compile(r'^/project/([^/]+)')
-
     def __init__(self, game_id: str, ckm_repo: CkanMetaRepo, github_token: str) -> None:
         self.game_id = game_id
         self.ckm_repo = ckm_repo
@@ -260,17 +252,17 @@ def __init__(self, game_id: str, ckm_repo: CkanMetaRepo, github_token: str) -> None:
         )
 
     def get_counts(self) -> None:
-        graph_query = GraphQLQuery(self.github_token)
+        graph_query = GitHubBatchedQuery(self.github_token)
         sd_query = SpaceDockBatchedQuery()
-        ia_query = InternetArchiveBatchedQuery()
+        ia_query: Optional[InternetArchiveBatchedQuery] = InternetArchiveBatchedQuery()
         for ckan in self.ckm_repo.all_latest_modules():  # pylint: disable=too-many-nested-blocks
             if ckan.kind == 'dlc':
                 continue
             for download in ckan.downloads:
                 try:
                     url_parse = urllib.parse.urlparse(download)
                     if url_parse.netloc == 'github.com':
-                        match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
+                        match = GitHubBatchedQuery.PATH_PATTERN.match(url_parse.path)
                         if match:
                             # Process GitHub modules together in big batches
                             graph_query.add(ckan.identifier, *match.groups())
@@ -280,20 +272,27 @@ def get_counts(self) -> None:
                             # Clear request list
                             graph_query.clear()
                     elif url_parse.netloc == 'spacedock.info':
-                        match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
+                        match = SpaceDockBatchedQuery.PATH_PATTERN.match(url_parse.path)
                         if match:
                             # Process SpaceDock modules together in one huge batch
                             sd_query.add(ckan.identifier, int(match.group(1)))
                         else:
                             logging.error('Failed to parse SD URL for %s: %s',
                                           ckan.identifier, download)
                     elif url_parse.netloc == 'archive.org':
-                        ia_query.add(ckan)
-                        if ia_query.full():
-                            ia_query.get_result(self.counts)
-                            ia_query = InternetArchiveBatchedQuery()
+                        if ia_query:
+                            ia_query.add(ckan)
+                            if ia_query.full():
+                                try:
+                                    ia_query.get_result(self.counts)
+                                    ia_query = InternetArchiveBatchedQuery()
+                                except ConnectTimeout as exc:
+                                    # Cleanly turn off archive.org counting while the downtime continues
+                                    logging.error('Failed to get counts from archive.org',
+                                                  exc_info=exc)
+                                    ia_query = None
                     elif url_parse.netloc.endswith('.sourceforge.net'):
-                        match = self.SOURCEFORGE_PATH_PATTERN.match(url_parse.path)
+                        match = SourceForgeQuerier.PATH_PATTERN.match(url_parse.path)
                         if match:
                             SourceForgeQuerier.get_result(ckan.identifier, match.group(1),
                                                           self.counts)
@@ -310,7 +309,7 @@ def get_counts(self) -> None:
         if not graph_query.empty():
             # Final pass doesn't overflow the bound
             graph_query.get_result(self.counts)
-        if not ia_query.empty():
+        if ia_query and not ia_query.empty():
             ia_query.get_result(self.counts)
 
     def write_json(self) -> None:
