Skip to content

Commit

Permalink
get: ena: Support --output-directory.
Browse files Browse the repository at this point in the history
Reported by @mbhall88

#33
  • Loading branch information
wwood committed Dec 18, 2023
1 parent 2e697f3 commit 0fa302f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 35 deletions.
15 changes: 9 additions & 6 deletions kingfisher/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def download_from_aws(odp_link, run_identifier, download_threads, method):
logging.warning("Not using method gcp-cp as --allow-paid was not specified")

elif method == 'ena-ascp':
result = EnaDownloader().download_with_aspera(run_identifier, '.',
result = EnaDownloader().download_with_aspera(run_identifier, output_directory,
ascp_args=ascp_args,
ssh_key=ascp_ssh_key,
check_md5sums=check_md5sums)
Expand All @@ -346,6 +346,7 @@ def download_from_aws(odp_link, run_identifier, download_threads, method):
result = EnaDownloader().download_with_curl(
run_identifier,
download_threads,
output_directory,
check_md5sums=check_md5sums)
if result is not False:
gzip_test_files(result)
Expand Down Expand Up @@ -382,9 +383,11 @@ def download_from_aws(odp_link, run_identifier, download_threads, method):
else:
if stdout:
raise Exception("--stdout currently must be via download of a .sra format file, rather than a download from ENA. I imagine this will be fixed in future.")
if 'fastq.gz' not in output_format_possibilities:
if 'fastq.gz' in output_format_possibilities:
output_files = downloaded_files
else:
for fq in ['x_1.fastq.gz','x_2.fastq.gz','x.fastq.gz']:
f = fq.replace('x',run_identifier)
f = output_location_factory.output_stem(fq.replace('x',run_identifier))
if os.path.exists(f):
# Do the least work, currently we have FASTQ.gz
if 'fasta' in output_format_possibilities:
Expand All @@ -409,9 +412,9 @@ def download_from_aws(odp_link, run_identifier, download_threads, method):
output_files.append(f.replace('.fastq.gz','.fastq'))
else:
raise Exception("Programming error")

else:
output_files = downloaded_files
if len(output_files) == 0:
raise Exception("No output files found, something went amiss, unsure what.")

logging.info("Output files: {}".format(', '.join(output_files)))

Expand Down
58 changes: 29 additions & 29 deletions kingfisher/ena.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import logging
import os
import pandas as pd
import hashlib

import extern
import bird_tool_utils

from .md5sum import MD5

Expand Down Expand Up @@ -81,7 +81,6 @@ def download_with_aspera(self, run_id, output_directory, quiet=False, ascp_args=
logging.info("Downloading {} FTP read set(s): {}".format(
len(ftp_urls), ", ".join(ftp_urls)))

aspera_commands = []
output_files = []
for url, md5 in zip(ftp_urls, md5sums):
quiet_args = ''
Expand Down Expand Up @@ -112,36 +111,37 @@ def download_with_aspera(self, run_id, output_directory, quiet=False, ascp_args=
output_files.append(output_file)
return output_files

def download_with_curl(self, run_id, num_threads, check_md5sums=False):
report = self.get_ftp_download_urls(run_id)
if report is False:
return False
ftp_urls = report.file_paths
md5sums = report.md5sums

downloaded = []
for url, md5 in zip(ftp_urls, md5sums):
logging.info("Downloading {} ..".format(url))
output_file = os.path.basename(url)
if num_threads > 1:
cmd = "aria2c -x{} -o {} 'ftp://{}'".format(
num_threads, output_file, url)
else:
cmd = "curl -L '{}' -o {}".format(url, output_file)
try:
subprocess.check_call(cmd, shell=True)
except subprocess.CalledProcessError as e:
logging.warning("Method ena-ftp failed, error was {}".format(e))
self._clean_incomplete_files(downloaded+[output_file])
def download_with_curl(self, run_id, num_threads, output_directory, check_md5sums=False):
with bird_tool_utils.in_working_directory(output_directory):
report = self.get_ftp_download_urls(run_id)
if report is False:
return False

if check_md5sums:
if MD5.check_md5sum(output_file, md5):
logging.info("MD5sum OK for {}".format(output_file))
ftp_urls = report.file_paths
md5sums = report.md5sums

downloaded = []
for url, md5 in zip(ftp_urls, md5sums):
logging.info("Downloading {} ..".format(url))
output_file = os.path.basename(url)
if num_threads > 1:
cmd = "aria2c -x{} -o {} 'ftp://{}'".format(
num_threads, output_file, url)
else:
logging.error("MD5sum failed for {}".format(output_file))
cmd = "curl -L '{}' -o {}".format(url, output_file)
try:
subprocess.check_call(cmd, shell=True)
except subprocess.CalledProcessError as e:
logging.warning("Method ena-ftp failed, error was {}".format(e))
self._clean_incomplete_files(downloaded+[output_file])
return False
downloaded.append(output_file)

if check_md5sums:
if MD5.check_md5sum(output_file, md5):
logging.info("MD5sum OK for {}".format(output_file))
else:
logging.error("MD5sum failed for {}".format(output_file))
self._clean_incomplete_files(downloaded+[output_file])
return False
downloaded.append(os.path.join(output_directory, output_file))
return downloaded

15 changes: 15 additions & 0 deletions test/test_ena.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ def test_fasta_gz_via_ena_ftp(self):
self.assertTrue(os.path.getsize('SRR12118866_1.fasta.gz')==746749)
self.assertTrue(os.path.getsize('SRR12118866_2.fasta.gz')==899862)

def test_fasta_gz_via_ena_ftp_output_directory(self):
with in_tempdir():
extern.run('{} get -r SRR12118866 -m ena-ftp --output-format-possibilities fasta.gz --output-directory outdir'.format(
kingfisher))
self.assertTrue(os.path.getsize('outdir/SRR12118866_1.fasta.gz')==746749)
self.assertTrue(os.path.getsize('outdir/SRR12118866_2.fasta.gz')==899862)

def test_aria2_aws_fasta(self):
'''cannot test this in travis because conda aria2 is broken'''
with in_tempdir():
Expand All @@ -79,6 +86,14 @@ def test_aria2_aws_fasta(self):
self.assertTrue(os.path.getsize('SRR12118866_1.fasta.gz')==757641)
self.assertTrue(os.path.getsize('SRR12118866_2.fasta.gz')==907591)

def test_aria2_aws_fasta_output_directory(self):
'''cannot test this in travis because conda aria2 is broken'''
with in_tempdir():
extern.run('{} get --download-threads 4 -r SRR12118866 -m aws-http --output-format-possibilities fasta.gz --output-directory outdir'.format(
kingfisher))
self.assertTrue(os.path.getsize('outdir/SRR12118866_1.fasta.gz')==757641)
self.assertTrue(os.path.getsize('outdir/SRR12118866_2.fasta.gz')==907591)

def test_aria2_ena_fastq_gz(self):
'''cannot test this in travis because conda aria2 is broken'''
with in_tempdir():
Expand Down

0 comments on commit 0fa302f

Please sign in to comment.