Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configs for user/topic filters and folder/file format #25

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.venv
.venv
config.py
__pycache__
.DS_Store
.vscode
37 changes: 30 additions & 7 deletions config_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@

# Date range (inclusive) for downloads, None value for Days gets replaced by first/last day of the month.
START_DAY, START_MONTH, START_YEAR = None, 5, 2020
END_DAY, END_MONTH, END_YEAR = None , 3, 2022
END_DAY, END_MONTH, END_YEAR = None, 3, 2022
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls revert the added spaces


# Put here emails of the users you want to check for recordings. If empty, all users under the account will be checked.
USERS = [
USERS_INCLUDE = [
# R"####@####.####",
# R"####@####.####",
]

# Put here emails of the users you want to exclude from checking for recordings. Optional. Exclusions are applied after USERS_INCLUDE, so a user listed in both lists is excluded.
USERS_EXCLUDE = [
# R"####@####.####",
# R"####@####.####",
]
Comment on lines -14 to 23
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic here isn't clear to me from the docs, What gets precedence?

Expand All @@ -22,6 +28,9 @@
# R"############",
]

# If True, topics that partially match your topic filters are downloaded. If False, only meetings with exact topic matches are downloaded.
PARTIAL_MATCH_TOPICS = False

# Put here the file types you wish to download. If empty, no file type filtering will happen.
RECORDING_FILE_TYPES = [
# R"MP4", # Video file of the recording.
Expand All @@ -32,11 +41,13 @@
# R"SUMMARY", # Summary file of the recording in JSON file format.
]

# If True, recordings will be grouped in folders by their owning user.
GROUP_BY_USER = True

# If True, recordings will be grouped in folders by their topics
GROUP_BY_TOPIC = True
# Group recordings in a folder hierarchy using the order below.
# Reorder or comment out any of the folder groups below to control the folder hierarchy created to organize the downloaded recording files.
GROUP_FOLDERS_BY = [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GROUP_BY or FOLDER_HIERARCHY

# R"YEAR_MONTH", # Recordings will be grouped in folders by their recording start date in yyyy-mm format.
R"USER_EMAIL", # Recordings will be grouped in folders by their owning user's email address.
R"TOPIC", # Recordings will be grouped in folders by their topics.
]

# If True, each instance of recording will be in its own folder (which may contain multiple files).
# Note: One "meeting" can have multiple recording instances.
Expand All @@ -46,6 +57,18 @@
# This works when "Record a separate audio file of each participant" is enabled.
INCLUDE_PARTICIPANT_AUDIO = True

# Recording file name format to use when saving files. Reorder or comment out any file name format pieces below to control the file naming pattern.
# Example: 2023-12-25t143021z__name-of-the-meeting__audio_transcript__ff625374.VTT
FILE_NAME_FORMAT = [
R"RECORDING_START_DATETIME", # Recording start datetime
R"RECORDING_NAME", # Recording name
R"RECORDING_TYPE", # Recording type
R"FILE_ID", # Recording unique file ID
]
Comment on lines +60 to +67
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit problematic
I use the file_id in the end to detect already downloaded file. I don't think it's a good idea to let users remove it...


# Separator character(s) to place in between the file name format pieces when building the recording file names.
FILE_NAME_SEPERATOR = "__"

# Set to True for more verbose output
VERBOSE_OUTPUT = False

Expand Down
133 changes: 94 additions & 39 deletions zoom_batch_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def main():

print_filter_warnings()

if CONFIG.PARTIAL_MATCH_TOPICS:
CONFIG.TOPICS = [topic.lower() for topic in CONFIG.TOPICS]

from_date = datetime.datetime(CONFIG.START_YEAR, CONFIG.START_MONTH, CONFIG.START_DAY or 1)
to_date = datetime.datetime(
CONFIG.END_YEAR, CONFIG.END_MONTH, CONFIG.END_DAY or monthrange(CONFIG.END_YEAR, CONFIG.END_MONTH)[1],
Expand All @@ -38,8 +41,11 @@ def print_filter_warnings():
if CONFIG.TOPICS:
utils.print_bright(f'Topics filter is active {CONFIG.TOPICS}')
did_print = True
if CONFIG.USERS:
utils.print_bright(f'Users filter is active {CONFIG.USERS}')
if CONFIG.USERS_INCLUDE:
utils.print_bright(f'Users include filter is active {CONFIG.USERS_INCLUDE}')
did_print = True
if CONFIG.USERS_EXCLUDE:
utils.print_bright(f'Users exclude filter is active {CONFIG.USERS_EXCLUDE}')
did_print = True
if CONFIG.RECORDING_FILE_TYPES:
utils.print_bright(f'Recording file types filter is active {CONFIG.RECORDING_FILE_TYPES}')
Expand All @@ -49,16 +55,29 @@ def print_filter_warnings():
print()

def get_users():
if CONFIG.USERS:
return [(email, '') for email in CONFIG.USERS]

return paginate_reduce(
'https://api.zoom.us/v2/users?status=active', [],
lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
) + paginate_reduce(
'https://api.zoom.us/v2/users?status=inactive', [],
lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
)
if CONFIG.USERS_INCLUDE:
users = [(email, '') for email in CONFIG.USERS_INCLUDE]
else:
users = paginate_reduce(
'https://api.zoom.us/v2/users?status=active', [],
lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
) + paginate_reduce(
'https://api.zoom.us/v2/users?status=inactive', [],
lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
)

if CONFIG.VERBOSE_OUTPUT:
utils.print_dim('Found matching users:')

for user_email, user_name in users:
if user_email in CONFIG.USERS_EXCLUDE:
users.pop(users.index((user_email, user_name)))
continue

if CONFIG.VERBOSE_OUTPUT:
utils.print_dim(f'{user_name} <{user_email}>')
Comment on lines +69 to +78
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just run a filter on the list, then print it as is


return users

def paginate_reduce(url, initial, reduce):
initial_url = utils.add_url_params(url, {'page_size': 300})
Expand Down Expand Up @@ -125,15 +144,15 @@ def download_recordings(users, from_date, to_date):

for user_email, user_name in users:
user_description = get_user_description(user_email, user_name)
user_host_folder = get_user_host_folder(user_email)
host_folder = CONFIG.OUTPUT_PATH

utils.print_bright(
f'Downloading recordings from user {user_description} - Starting at {date_to_str(from_date)} '
f'and up to {date_to_str(to_date)} (inclusive).'
)

meetings = get_meetings(get_meeting_uuids(user_email, from_date, to_date))
user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, user_host_folder)
user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, host_folder, user_email)

utils.print_bright('######################################################################')
print()
Expand All @@ -147,12 +166,6 @@ def download_recordings(users, from_date, to_date):
def get_user_description(user_email, user_name):
return f'{user_email} ({user_name})' if (user_name) else user_email

def get_user_host_folder(user_email):
if CONFIG.GROUP_BY_USER:
return os.path.join(CONFIG.OUTPUT_PATH, user_email)
else:
return CONFIG.OUTPUT_PATH

def date_to_str(date):
return date.strftime('%Y-%m-%d')

Expand Down Expand Up @@ -180,6 +193,8 @@ def get_meeting_uuids(user_email, start_date, end_date):
local_start_date = local_end_date + datetime.timedelta(days=1)
progress_bar.update(1)

utils.print_dim(f"Meetings found: {len(meeting_uuids)}")

return meeting_uuids

def get_meetings(meeting_uuids):
Expand All @@ -189,14 +204,23 @@ def get_meetings(meeting_uuids):
url = f'https://api.zoom.us/v2/meetings/{utils.double_encode(meeting_uuid)}/recordings'
meetings.append(get_with_token(lambda t: requests.get(url=url, headers=get_headers(t))).json())

utils.print_dim(f"Recordings found: {len(meetings)}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't the number of recordings
it's more complicated than that


return meetings

def download_recordings_from_meetings(meetings, host_folder):
def download_recordings_from_meetings(meetings, host_folder, user_email):
file_count, total_size, skipped_count = 0, 0, 0

for meeting in meetings:
if CONFIG.TOPICS and meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS:
continue
if CONFIG.TOPICS and meeting['topic']:
if CONFIG.PARTIAL_MATCH_TOPICS:
topic_lower = str.lower(meeting['topic'])
topic_lower_slug = utils.slugify(meeting['topic'])
Comment on lines +217 to +218
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

put outside the if statement

if not any(topic in topic_lower for topic in CONFIG.TOPICS) and not any(topic in topic_lower_slug for topic in CONFIG.TOPICS):
continue
else:
if meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS:
continue

recording_files = meeting.get('recording_files') or []
participant_audio_files = meeting.get('participant_audio_files') or [] if CONFIG.INCLUDE_PARTICIPANT_AUDIO else []
Expand All @@ -210,30 +234,52 @@ def download_recordings_from_meetings(meetings, host_folder):

url = recording_file['download_url']
topic = utils.slugify(meeting['topic'])
ext = recording_file.get('file_extension') or os.path.splitext(recording_file['file_name'])[1]
recording_name = utils.slugify(f'{topic}__{recording_file["recording_start"]}')
file_id = recording_file['id']
file_name_suffix = os.path.splitext(recording_file['file_name'])[0] + '__' if 'file_name' in recording_file else ''
recording_type_suffix = recording_file["recording_type"] + '__' if 'recording_type' in recording_file else ''
file_name = utils.slugify(
f'{recording_name}__{recording_type_suffix}{file_name_suffix}{file_id[-8:]}'
) + '.' + ext
recording_name = utils.slugify(f'{topic}')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

utils.slugify(topic)


file_name = build_file_name(recording_file, topic)
file_size = int(recording_file.get('file_size'))

if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name):
if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name, recording_file["recording_start"], user_email):
total_size += file_size
file_count += 1
else:
skipped_count += 1

return file_count, total_size, skipped_count

def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name):
def build_file_name(recording_file, topic):
    """Build the output file name for a single recording file.

    The name is assembled from the pieces listed in ``CONFIG.FILE_NAME_FORMAT``
    (in the configured order), joined with ``CONFIG.FILE_NAME_SEPERATOR``,
    passed through ``utils.slugify`` and suffixed with the file extension.

    Args:
        recording_file: Recording-file dict. ``recording_start`` and ``id``
            are required; ``file_name``, ``recording_type`` and
            ``file_extension`` are optional.
        topic: Meeting topic, used as the recording name.

    Returns:
        The file name, including the extension when one is known.

    Raises:
        KeyError: If ``recording_file`` lacks ``recording_start`` or ``id``.
    """
    raw_file_name = recording_file.get('file_name') or ''
    file_name_stem, file_name_ext = os.path.splitext(raw_file_name)

    # os.path.splitext keeps the leading dot on the extension; strip it so the
    # '.' added below does not produce a "name..ext" file name.
    file_extension = recording_file.get('file_extension') or file_name_ext.lstrip('.')

    # Each format key maps to the piece(s) it contributes. The file-name stem
    # (when present) is a separate piece so it is joined with the configured
    # separator instead of being glued onto the recording name.
    pieces_by_key = {
        'RECORDING_START_DATETIME': [utils.slugify(f'{recording_file["recording_start"]}')],
        'RECORDING_NAME': [utils.slugify(f'{topic}'), file_name_stem],
        'RECORDING_TYPE': [recording_file.get('recording_type') or ''],
        'FILE_ID': [recording_file['id'][-8:]],
    }

    # Unknown format keys are ignored; empty pieces are dropped so that a
    # missing optional field does not leave a doubled separator in the name.
    file_name_pieces = [
        f'{piece}'
        for format_key in CONFIG.FILE_NAME_FORMAT
        for piece in pieces_by_key.get(format_key, [])
        if piece
    ]

    file_name = utils.slugify(f'{CONFIG.FILE_NAME_SEPERATOR}'.join(file_name_pieces))
    if file_extension:
        file_name += '.' + file_extension

    return file_name

def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name, recording_start, user_email):
folder_path = create_folder_path(host_folder, topic, recording_name, recording_start, user_email)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would call this outside this method

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to pass all these parameters to the download method

file_path = os.path.join(folder_path, file_name)

if CONFIG.VERBOSE_OUTPUT:
print()
utils.print_dim(f'URL: {download_url}')

file_path = create_path(host_folder, file_name, topic, recording_name)
utils.print_dim(f'Folder: {folder_path}')

if os.path.exists(file_path) and abs(os.path.getsize(file_path) - file_size) <= CONFIG.FILE_SIZE_MISMATCH_TOLERANCE:
utils.print_dim(f'Skipping existing file: {file_name}')
Expand All @@ -257,16 +303,25 @@ def download_recording_file(download_url, host_folder, file_name, file_size, top

return True

def create_path(host_folder, file_name, topic, recording_name):
def create_folder_path(host_folder, topic, recording_name, recording_start, user_email):
folder_path = host_folder

if CONFIG.GROUP_BY_TOPIC:
folder_path = os.path.join(folder_path, topic)
for group_by in CONFIG.GROUP_FOLDERS_BY:
if group_by == "YEAR_MONTH":
recording_start_date = datetime.datetime.strptime(recording_start, '%Y-%m-%dT%H:%M:%SZ')
year_month = recording_start_date.strftime('%Y-%m')
folder_path = os.path.join(folder_path, year_month)
if group_by == "USER_EMAIL":
folder_path = os.path.join(folder_path, user_email)
if group_by == "TOPIC":
folder_path = os.path.join(folder_path, topic)

if CONFIG.GROUP_BY_RECORDING:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this still here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be part of your grouping mechanism, no?

folder_path = os.path.join(folder_path, recording_name)

os.makedirs(folder_path, exist_ok=True)
return os.path.join(folder_path, file_name)

return folder_path

def do_with_token(do):
def do_as_get(token):
Expand Down