lanec · PancakeTornado · Feb 10, 2024 · Feb 10, 2024 · Feb 10, 2024 · Feb 10, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
-.venv
+.venv
 config.py
 __pycache__
+.DS_Store
+.vscode
diff --git a/config_template.py b/config_template.py
@@ -8,10 +8,16 @@
 
 # Date range (inclusive) for downloads, None value for Days gets replaced by first/last day of the month.
 START_DAY, START_MONTH, START_YEAR = None, 5, 2020
-END_DAY, END_MONTH, END_YEAR = None , 3, 2022
+END_DAY,   END_MONTH,   END_YEAR   = None, 3, 2022
 
 # Put here emails of the users you want to check for recordings. If empty, all users under the account will be checked.
-USERS = [
+USERS_INCLUDE = [
+    # R"####@####.####",
+    # R"####@####.####",
+]
+
+# Put here emails of the users you want to exclude from checking for recordings. Optional.
+USERS_EXCLUDE = [
     # R"####@####.####",
     # R"####@####.####",
 ]
@@ -22,6 +28,9 @@
     # R"############",
 ]
 
+# If True, topics that partially match your topic filters are downloaded. If False, only meetings with exact topic matches are downloaded.
+PARTIAL_MATCH_TOPICS = False
+
 # Put here the file types you wish to download. If empty, no file type filtering will happen.
 RECORDING_FILE_TYPES = [
     # R"MP4",            # Video file of the recording.
@@ -32,11 +41,13 @@
     # R"SUMMARY",        # Summary file of the recording in JSON file format.
 ]
 
-# If True, recordings will be grouped in folders by their owning user.
-GROUP_BY_USER = True
-
-# If True, recordings will be grouped in folders by their topics
-GROUP_BY_TOPIC = True
+# Group records in a folder hierarchy using the order below.
+# Reorder or comment out any of the folder groups below to control the folder hierarchy created to orgainze the downloaded recording files.
+GROUP_FOLDERS_BY = [
+    # R"YEAR_MONTH",     # Recordings will be grouped in folders by their recording start date in yyyy-mm format.
+     R"USER_EMAIL",     # Recordings will be grouped in folders by their owning user's email address.
+     R"TOPIC",          # Recordings will be grouped in folders by their topics.
+]
 
 # If True, each instance of recording will be in its own folder (which may contain multiple files).
 # Note: One "meeting" can have multiple recording instances.
@@ -46,6 +57,18 @@
 # This works when "Record a separate audio file of each participant" is enabled.
 INCLUDE_PARTICIPANT_AUDIO = True
 
+# Recording file name format to use when saving files. Reorder or comment out any file name format pieces below to control the file naming pattern.
+# Example: 2023-12-25t143021z__name-of-the-meeting__audio_transcript__ff625374.VTT
+FILE_NAME_FORMAT = [
+     R"RECORDING_START_DATETIME",   # Recording start datetime
+     R"RECORDING_NAME",             # Recording name
+     R"RECORDING_TYPE",             # Recoding type
+     R"FILE_ID",                    # Recording unique file ID
+]
+
+# Seperator character(s) to place in between the file name format pieces when building the recording file names.
+FILE_NAME_SEPERATOR = "__"
+
 # Set to True for more verbose output
 VERBOSE_OUTPUT = False
 

diff --git a/zoom_batch_downloader.py b/zoom_batch_downloader.py
@@ -17,6 +17,9 @@ def main():
 
 	print_filter_warnings()
 
+	if CONFIG.PARTIAL_MATCH_TOPICS:
+		CONFIG.TOPICS = [topic.lower() for topic in CONFIG.TOPICS]
+
 	from_date = datetime.datetime(CONFIG.START_YEAR, CONFIG.START_MONTH, CONFIG.START_DAY or 1)
 	to_date = datetime.datetime(
 		CONFIG.END_YEAR, CONFIG.END_MONTH, CONFIG.END_DAY or monthrange(CONFIG.END_YEAR, CONFIG.END_MONTH)[1],
@@ -38,8 +41,11 @@ def print_filter_warnings():
 	if CONFIG.TOPICS:
 		utils.print_bright(f'Topics filter is active {CONFIG.TOPICS}')
 		did_print = True
-	if CONFIG.USERS:
-		utils.print_bright(f'Users filter is active {CONFIG.USERS}')
+	if CONFIG.USERS_INCLUDE:
+		utils.print_bright(f'Users include filter is active {CONFIG.USERS_INCLUDE}')
+		did_print = True
+	if CONFIG.USERS_EXCLUDE:
+		utils.print_bright(f'Users exclude filter is active {CONFIG.USERS_EXCLUDE}')
 		did_print = True
 	if CONFIG.RECORDING_FILE_TYPES:
 		utils.print_bright(f'Recording file types filter is active {CONFIG.RECORDING_FILE_TYPES}')
@@ -49,16 +55,29 @@ def print_filter_warnings():
 		print()
 
 def get_users():
-	if CONFIG.USERS:
-		return [(email, '') for email in CONFIG.USERS]
-
-	return paginate_reduce(
-		'https://api.zoom.us/v2/users?status=active', [],
-		lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
-	) + paginate_reduce(
-		'https://api.zoom.us/v2/users?status=inactive', [],
-		lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
-	)
+	if CONFIG.USERS_INCLUDE:
+		users = [(email, '') for email in CONFIG.USERS_INCLUDE]
+	else:
+		users = paginate_reduce(
+			'https://api.zoom.us/v2/users?status=active', [],
+			lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
+		) + paginate_reduce(
+			'https://api.zoom.us/v2/users?status=inactive', [],
+			lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']]
+		)
+
+	if CONFIG.VERBOSE_OUTPUT:
+		utils.print_dim('Found matching users:')
+
+	for user_email, user_name in users:
+		if user_email in CONFIG.USERS_EXCLUDE:
+			users.pop(users.index((user_email, user_name)))
+			continue
+
+		if CONFIG.VERBOSE_OUTPUT:
+			utils.print_dim(f'{user_name} <{user_email}>')
+
+	return users
 
 def paginate_reduce(url, initial, reduce):
 	initial_url = utils.add_url_params(url, {'page_size': 300})
@@ -125,15 +144,15 @@ def download_recordings(users, from_date, to_date):
 
 	for user_email, user_name in users:
 		user_description = get_user_description(user_email, user_name)
-		user_host_folder = get_user_host_folder(user_email)
+		host_folder = CONFIG.OUTPUT_PATH
 
 		utils.print_bright(
 			f'Downloading recordings from user {user_description} - Starting at {date_to_str(from_date)} '
 			f'and up to {date_to_str(to_date)} (inclusive).'
 		)
 
 		meetings = get_meetings(get_meeting_uuids(user_email, from_date, to_date))
-		user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, user_host_folder)
+		user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, host_folder, user_email)
 
 		utils.print_bright('######################################################################')
 		print()
@@ -147,12 +166,6 @@ def download_recordings(users, from_date, to_date):
 def get_user_description(user_email, user_name):
 	return f'{user_email} ({user_name})' if (user_name) else user_email
 
-def get_user_host_folder(user_email):
-	if CONFIG.GROUP_BY_USER:
-		return os.path.join(CONFIG.OUTPUT_PATH, user_email)
-	else:
-		return CONFIG.OUTPUT_PATH
-
 def date_to_str(date):
 	return date.strftime('%Y-%m-%d')
 
@@ -180,6 +193,8 @@ def get_meeting_uuids(user_email, start_date, end_date):
 			local_start_date = local_end_date + datetime.timedelta(days=1)
 			progress_bar.update(1)
 
+	utils.print_dim(f"Meetings found: {len(meeting_uuids)}")
+
 	return meeting_uuids
 
 def get_meetings(meeting_uuids):
@@ -189,14 +204,23 @@ def get_meetings(meeting_uuids):
 		url = f'https://api.zoom.us/v2/meetings/{utils.double_encode(meeting_uuid)}/recordings'
 		meetings.append(get_with_token(lambda t: requests.get(url=url, headers=get_headers(t))).json())
 
+	utils.print_dim(f"Recordings found: {len(meetings)}")
+
 	return meetings
 
-def download_recordings_from_meetings(meetings, host_folder):
+def download_recordings_from_meetings(meetings, host_folder, user_email):
 	file_count, total_size, skipped_count = 0, 0, 0
 
 	for meeting in meetings:
-		if CONFIG.TOPICS and meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS:
-			continue
+		if CONFIG.TOPICS and meeting['topic']:
+			if CONFIG.PARTIAL_MATCH_TOPICS:
+				topic_lower = str.lower(meeting['topic'])
+				topic_lower_slug = utils.slugify(meeting['topic'])
+				if not any(topic in topic_lower for topic in CONFIG.TOPICS) and not any(topic in topic_lower_slug for topic in CONFIG.TOPICS):
+					continue
+			else:
+				if meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS:
+					continue
 
 		recording_files = meeting.get('recording_files') or []
 		participant_audio_files = meeting.get('participant_audio_files') or [] if CONFIG.INCLUDE_PARTICIPANT_AUDIO else []
@@ -210,30 +234,52 @@ def download_recordings_from_meetings(meetings, host_folder):
 
 			url = recording_file['download_url']
 			topic = utils.slugify(meeting['topic'])
-			ext = recording_file.get('file_extension') or os.path.splitext(recording_file['file_name'])[1]
-			recording_name = utils.slugify(f'{topic}__{recording_file["recording_start"]}')
-			file_id = recording_file['id']
-			file_name_suffix =  os.path.splitext(recording_file['file_name'])[0] + '__' if 'file_name' in recording_file else ''
-			recording_type_suffix =  recording_file["recording_type"] + '__' if 'recording_type' in recording_file else ''
-			file_name = utils.slugify(
-				f'{recording_name}__{recording_type_suffix}{file_name_suffix}{file_id[-8:]}'
-			) + '.' + ext
+			recording_name = utils.slugify(f'{topic}')
+
+			file_name = build_file_name(recording_file, topic)
 			file_size = int(recording_file.get('file_size'))
 
-			if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name):
+			if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name, recording_file["recording_start"], user_email):
 				total_size += file_size
 				file_count += 1
 			else:
 				skipped_count += 1
 
 	return file_count, total_size, skipped_count
 
-def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name):
+def build_file_name(recording_file, topic):
+	recording_name = utils.slugify(f'{topic}')
+	recording_start = utils.slugify(f'{recording_file["recording_start"]}')
+	file_id = recording_file['id'][-8:]
+	file_name_suffix =  os.path.splitext(recording_file['file_name'])[0] + '__' if 'file_name' in recording_file else ''
+	recording_type_suffix = ''
+
+	recording_type_suffix =  recording_file["recording_type"] if 'recording_type' in recording_file else ''
+	file_extension = recording_file.get('file_extension') or os.path.splitext(recording_file['file_name'])[1]
+
+	file_name_pieces = []
+	for format in CONFIG.FILE_NAME_FORMAT:
+		if format == "RECORDING_START_DATETIME":
+			file_name_pieces.append(f'{recording_start}')
+		if format == "RECORDING_NAME":
+			file_name_pieces.append(f'{recording_name}{file_name_suffix}')
+		if format == "RECORDING_TYPE":
+			file_name_pieces.append(f'{recording_type_suffix}')
+		if format == "FILE_ID":
+			file_name_pieces.append(f'{file_id}')
+
+	file_name = utils.slugify(f'{CONFIG.FILE_NAME_SEPERATOR}'.join(file_name_pieces)) + '.' + file_extension
+
+	return file_name
+
+def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name, recording_start, user_email):
+	folder_path = create_folder_path(host_folder, topic, recording_name, recording_start, user_email)
+	file_path = os.path.join(folder_path, file_name)
+
 	if CONFIG.VERBOSE_OUTPUT:
 		print()
 		utils.print_dim(f'URL: {download_url}')
-
-	file_path = create_path(host_folder, file_name, topic, recording_name)
+		utils.print_dim(f'Folder: {folder_path}')
 
 	if os.path.exists(file_path) and abs(os.path.getsize(file_path) - file_size) <= CONFIG.FILE_SIZE_MISMATCH_TOLERANCE:
 		utils.print_dim(f'Skipping existing file: {file_name}')
@@ -257,16 +303,25 @@ def download_recording_file(download_url, host_folder, file_name, file_size, top
 
 	return True
 
-def create_path(host_folder, file_name, topic, recording_name):
+def create_folder_path(host_folder, topic, recording_name, recording_start, user_email):
 	folder_path = host_folder
 
-	if CONFIG.GROUP_BY_TOPIC:
-		folder_path = os.path.join(folder_path, topic)
+	for group_by in CONFIG.GROUP_FOLDERS_BY:
+		if group_by == "YEAR_MONTH":
+			recording_start_date = datetime.datetime.strptime(recording_start, '%Y-%m-%dT%H:%M:%SZ')
+			year_month = recording_start_date.strftime('%Y-%m')
+			folder_path = os.path.join(folder_path, year_month)
+		if group_by == "USER_EMAIL":
+			folder_path = os.path.join(folder_path, user_email)
+		if group_by == "TOPIC":
+			folder_path = os.path.join(folder_path, topic)
+
 	if CONFIG.GROUP_BY_RECORDING:
 		folder_path = os.path.join(folder_path, recording_name)
 
 	os.makedirs(folder_path, exist_ok=True)
-	return os.path.join(folder_path, file_name)
+
+	return folder_path
 
 def do_with_token(do):
 	def do_as_get(token):