forked from gwu-libraries/batch-loader
-
Notifications
You must be signed in to change notification settings - Fork 3
/
get_file.py
201 lines (183 loc) · 7 KB
/
get_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import re
import os
import time
import json
import subprocess
import getpass
from urllib.parse import unquote
import tempfile
import xml.etree.ElementTree as xtree
from lxml import etree
import requests
import validators
from FormatLog import FormatLogger
logger = FormatLogger()
# Written for WPI: helpers for ingesting files from a URL.
class UrlException(ValueError):
    """Raised when a URL is invalid or the file behind it cannot be downloaded."""
def create_tiff_imagemagick(file):
    """
    Desc: converts the given file to a TIFF by running ImageMagick's `convert`
          through subprocess; the TIFF path is the input path with its
          extension replaced by '.tiff'
    Args: file (str): path to the file a TIFF should be generated for
    Returns: path to the newly created TIFF
    Raises: Exception when no TIFF exists at the expected path afterwards
    """
    logger.info("creating tiff for",file,'...')
    stem, _extension = os.path.splitext(file)
    tiff = stem + '.tiff'
    # The exit status of `convert` is not inspected; success is judged only by
    # whether the output file exists once the command has finished.
    subprocess.run(['convert',file,tiff], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
    if not os.path.exists(tiff):
        logger.error('Could not create TIFF')
        raise Exception("image magick convert failed to produce tiff, if you are on windows this doesnt work use magick convert instead.\n\t command: convert {} {}".format(file,tiff))
    return tiff
def create_dir_for(files):
    """
    Desc: makes a fresh temporary directory next to the first file in the
          list and moves every listed file into it
    Args: files (list): paths of the files to be moved; must not be empty
    Returns: path of the newly created directory
    """
    first_parent = os.path.dirname(files[0])
    target_dir = tempfile.mkdtemp(dir=first_parent)
    for source in files:
        # os.rename acts as `mv source target_dir/<basename>`
        destination = os.path.join(target_dir, os.path.basename(source))
        os.rename(source, destination)
    return target_dir
def get_file_name_from_url(url):
    """
    Desc: returns the last path segment of the url, i.e. everything after the
          rightmost '/' (a single trailing '/' is ignored), with url encoding
          turned back into normal chars by urllib's unquote()
          ie www.blah.blah/blah/blah/file_name%20original.pdf => 'file_name original.pdf'
    Args: url (str): the url to take the name from
    Returns: the decoded file (or directory) name
    Raises: ValueError when the url contains no usable segment
    """
    # Each entry is (pattern, number of trailing characters to drop from the match).
    candidates = (
        ("[/][^/]+[/]$", 1),  # directory-style url that ends with '/'
        ("[/][^/]+$", 0),     # plain url that ends with the name itself
    )
    for pattern, trailing in candidates:
        found = re.search(pattern, url)
        if found:
            # skip the leading '/' of the match, and the trailing one if present
            segment = url[found.start() + 1:found.end() - trailing]
            return unquote(segment)
    logger.error('could not parse file name',url)
    raise ValueError('unable to figure anything out whatso ever {} '.format(url))
def grant_access(path,rights = '775'):
    """
    Desc: uses sudo to chmod the path to the given rights and then chown it
          to the user running this process; output of both commands is discarded
    Args: path (str): file or directory to change
          rights (str): mode string handed to chmod
    Returns: the CompletedProcess of the chown call (the chmod result is not checked)
    """
    silenced = {'stderr': subprocess.DEVNULL, 'stdout': subprocess.DEVNULL}
    owner = getpass.getuser()
    subprocess.run(['sudo','chmod',rights,path], **silenced)
    return subprocess.run(['sudo','chown',owner,path], **silenced)
def mv(path,new_path,args = None):
    """
    Desc: moves path to new_path with the `mv` command and falls back to
          `sudo mv` when the plain command fails
    Args: path (str): source path
          new_path (str): destination path
          args (list): extra command line arguments appended to the mv call
    Returns: None when the plain `mv` succeeded, otherwise the
             CompletedProcess of the `sudo mv` attempt
    """
    extra = [] if args is None else list(args)
    status = subprocess.run(['mv',path,new_path]+extra, stdout=subprocess.PIPE)
    # A CompletedProcess object is always truthy, so the exit code has to be
    # checked explicitly; otherwise the sudo fallback below is never reached.
    if status.returncode == 0:
        return None
    return subprocess.run(['sudo','mv',path,new_path]+extra, stdout=subprocess.PIPE)
def _join_download_path(directory, file_name):
    """Return file_name placed inside directory, adding a '/' only when needed."""
    if directory[-1] == '/':
        return directory + file_name
    return directory + '/' + file_name


def _filename_from_content_disposition(header):
    """Return the bare file name given in a Content-Disposition value, or ''."""
    # Stop at the closing quote or the next ';' so that trailing parameters
    # (e.g. '; size=123') do not end up in the file name.
    found = re.search(r'filename=("[^"]*"|[^;]+)', header)
    if not found:
        return ''
    # The header is supplied by the server: keep only the final path component
    # so a name such as '../../x' cannot escape the download directory.
    name = os.path.basename(found.group(1).strip().strip('"').strip())
    if name in ('', '.', '..'):
        return ''
    return name


def _request_file(url, auth_enable, auth_user, auth_pass):
    """Send the streaming GET for url, logging in first when auth_enable is set."""
    if not auth_enable:
        return requests.get(url, stream=True)
    # 1. login
    credentials = json.dumps({'name': auth_user, 'pass': auth_pass})
    login = requests.post("https://eprojects.wpi.edu/user/login?_format=json", data=credentials)
    # 2. download
    headers = {}
    cookies = {}
    # When the login does not answer 200 the request below is still sent,
    # just without the token and session cookies.
    if login.status_code == 200:
        headers['X-CSRF-Token'] = json.loads(login.text)['csrf_token']
        headers['Content-Type'] = 'application/json'
        cookies = login.cookies.get_dict()
    return requests.get(url, stream=True, headers=headers, cookies=cookies)


def _open_download_target(local_filename, dwnld_dir):
    """Open local_filename for binary writing, fixing directory access once if denied."""
    try:
        return open(local_filename, 'wb')
    except PermissionError:
        print('granting access to file')
        if grant_access(dwnld_dir).returncode != 0:
            logger.error("could not aquire permission to download to target dir")
            raise
        print('success')
    # Single retry only: if access is still denied the error is reported
    # instead of looping.
    try:
        return open(local_filename, 'wb')
    except PermissionError:
        logger.error("could not aquire permission to download to target dir")
        raise


def download_file(url, dwnld_dir=None, auth_enable=False, auth_user=None, auth_pass=None):
    """
    Desc: if the given url is valid and we have access to the file behind it,
          downloads that file into the given directory (created when missing)
          or into the current directory when none is given. The file is named
          after the Content-Disposition header when the server sends one,
          otherwise after the last segment of the url.
    Args: url (str): the url to download
          dwnld_dir (str): path of the directory to download to; None or ''
                           means the current directory
          auth_enable (bool): log in with auth_user/auth_pass before downloading
          auth_user (str): user name for the login request
          auth_pass (str): password for the login request
    Returns: absolute path of the downloaded file
    Raises: UrlException for an invalid url, when the server cannot be reached
            after 3 attempts, for a non 2xx response, or for an empty download;
            ValueError when no file name can be taken from the url;
            PermissionError when the target cannot be opened even after
            trying to grant access to the download directory
    """
    file_name = get_file_name_from_url(url)
    if dwnld_dir:
        local_filename = _join_download_path(dwnld_dir, file_name)
    else:
        # None and '' both mean "current directory"
        dwnld_dir = '.'
        local_filename = file_name
    if not os.path.exists(dwnld_dir):
        mkdir(dwnld_dir,['-p'])#make directory and make all directories that dont exist on the way

    # The validity of the url does not change between attempts, so check it once.
    if not validators.url(url.replace('[','B').replace(']','Be')):
        logger.error('Invalid url: {}'.format(url))
        raise UrlException('Invalid url: {}'.format(url))

    attempts = 0
    while True:
        attempts += 1
        try:
            r = _request_file(url, auth_enable, auth_user, auth_pass)
            break
        except requests.exceptions.ConnectionError as e:
            logger.error('Can not connect...\n',e,'\n',url)
            if attempts >= 3:
                raise UrlException('Could not connect to server to download file') from e
            time.sleep(2)

    if not 200 <= r.status_code <= 299:
        text = ''
        if r.text is not None:
            # keep the error message short: at most the first 100 characters
            text = r.text[:100]+'...' if len(r.text) >= 100 else r.text
        logger.error('failed to download file error:{}, {}'.format(r.status_code,url))
        raise UrlException('failed to download file.@{} code:{},body:{}'.format(url,r.status_code,text))

    if logger.prints <2:
        print('downloading file from {}'.format(url))
    # requests exposes headers case-insensitively, one lookup covers all spellings
    served_name = _filename_from_content_disposition(r.headers.get('content-disposition', ''))
    if served_name:
        local_filename = _join_download_path(dwnld_dir, served_name) # put it in download dir
    with _open_download_target(local_filename, dwnld_dir) as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
    file_size = os.path.getsize(local_filename)
    if logger.prints <2:
        print('done downloading %s' % (local_filename),"file size:",file_size)
    if file_size == 0:
        logger.error("file size is 0, file must not have downlaoded correctly")
        raise UrlException('Failed to downlaod')
    return os.path.abspath(local_filename)
def mkdir(path,args = None):
    """
    Desc: creates the directory with the `mkdir` command; when that fails it
          is created with `sudo mkdir -m 775` and then chown'ed to the user
          running this process. Output of all commands is discarded.
    Args: path (str): directory to create
          args (list): extra options placed before the path, e.g. ['-p']
    Returns: None when the plain `mkdir` succeeded, otherwise the
             CompletedProcess of the chown call
    """
    options = args if args is not None else []
    silenced = {'stderr': subprocess.DEVNULL, 'stdout': subprocess.DEVNULL}
    first_try = subprocess.run(['mkdir']+options+[path], **silenced)
    if first_try.returncode != 0:
        # plain mkdir was refused: create it as root, then hand it to this user
        owner = getpass.getuser()
        subprocess.run(['sudo','mkdir','-m','775']+options+[path], **silenced)
        return subprocess.run(['sudo','chown',owner,path], **silenced)
    return None