-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
subtotxt.py
178 lines (148 loc) · 6.68 KB
/
subtotxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# SRT or WEBVTT to plain Text
# Author: NebularNerd Version 2.1 (July 2024)
# https://github.com/NebularNerd/subtotxt
# Import required packages
import sys
import os
import argparse
import pkg_resources
import subprocess
import re
from pathlib import Path
# Install send2trash and charset_normalizer if missing.
# See https://pypi.org/project/Send2Trash/
# See https://github.com/Ousret/charset_normalizer
# Distribution names (as pip knows them) this script cannot run without.
REQUIRED = {'send2trash', 'charset-normalizer'}

# Collect the normalised key of every distribution visible to this interpreter.
installed = set()
for pkg in pkg_resources.working_set:
    installed.add(pkg.key)

# Anything required but not installed gets pip-installed into the running
# interpreter's environment; pip's normal output is silenced.
missing = REQUIRED.difference(installed)
if missing:
    print('Installing missing modules, please wait a few moments. This only happens once.')
    python = sys.executable
    pip_command = [python, '-m', 'pip', 'install']
    pip_command.extend(missing)
    subprocess.check_call(pip_command, stdout=subprocess.DEVNULL)
    print('Done, thanks for waiting')
from send2trash import send2trash
from charset_normalizer import from_path
# Clear screen win/*nix friendly
def cls():
    """Clear the terminal: runs `cls` when os.name is 'nt', `clear` otherwise."""
    command = 'clear'
    if os.name == 'nt':
        command = 'cls'
    os.system(command)


# Start with a clean console before any output is printed.
cls()
# Setup argparse
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='Quickly strip SRT or WEBVTT of subtitle numbers and timestamp, then save to plain text file \nVisit https://github.com/NebularNerd/subtotxt for more information',
)

# The input file is the only mandatory argument.
parser.add_argument(
    "--file", "-f",
    type=str,
    required=True,
    help='Path to .srt or .vtt file, enclose in quotes if path has spaces',
)

# Every remaining option is an optional on/off switch that defaults to off:
# (long flag, short flag, help text).
_switches = (
    ("--utf8", "-8", 'Force output file to use UTF-8 instead of input encoding'),
    ("--pause", "-p", 'Pauses at sanity check info to allow viewing before continuing'),
    ("--screen", "-s", 'Prints the conversion to the console as well as the file'),
    ("--copy", "-c", 'Copies input to output without change, appends -copy to filename'),
    ("--overwrite", "-o", 'Skips asking for permission to overwrite, will auto-delete old file and create a new one'),
    ("--oneliners", "-1", 'Write all sentences in one line, even if the original divides it into many lines or subtitles.'),
)
for _long_flag, _short_flag, _help_text in _switches:
    parser.add_argument(
        _long_flag, _short_flag,
        default=False,
        action="store_true",
        required=False,
        help=_help_text,
    )

args = parser.parse_args()
# Setup file wrangling stuff and sanity checks
# Input path, plus the two possible output paths derived from it:
#   ofile - same name with a .txt suffix (normal conversion)
#   cfile - same name and suffix with "-copy" appended to the stem (--copy mode)
ifile = Path(args.file)
ofile = ifile.with_suffix('.txt')
cfile = ifile.with_stem(f"{ifile.stem}-copy")

# Fail with a readable message instead of a traceback when the path is wrong.
if not ifile.is_file():
    sys.exit(f"Input file not found: {ifile}")

# charset_normalizer guesses the encoding; best() returns None when no
# candidate encoding fits, so that must be checked before touching attributes.
result = from_path(ifile).best()
if result is None:
    sys.exit(f"Unable to detect the character encoding of: {ifile}")

encoding = result.encoding
if encoding == "utf_8" and result.bom:
    # utf_8_sig strips the BOM on read and writes one back on output.
    encoding += "_sig"
# NOTE: no "_sig" variant is applied for utf_16. Python has no "utf_16_sig"
# codec (open() would raise LookupError); the plain "utf_16" codec already
# consumes the BOM when decoding and emits one when encoding.

confidence = 1.0 - result.chaos  # gives probability of match being correct
#Do stuff
print('SUB to TXT 2.0\n')
print('Input file : \n', ifile)

# The file that will be written is also the one that may need removing first.
deleteme = cfile if args.copy else ofile
print('Output file : \n', deleteme, '\n')

print('Detected Character Encoding:', encoding)
confidence_percent = confidence * 100
print('Confidence of encoding : {:0.2f}%'.format(confidence_percent))

# Output encoding: forced UTF-8 on request, otherwise mirror the input.
if not args.utf8:
    print('Output will use input encoding')
    encset = encoding
else:
    print('Output encoding forced to UTF-8')
    encset = "utf8"
print('\n\n')
answer = None
if args.pause:
    # Keep asking until the user gives an explicit y (continue) or n (quit).
    while True:
        answer = input("Ready to start? [y/n]")
        if answer == "y":
            print('Starting...')
            break
        if answer == "n":
            print ("OK, bye for now...\n\n")
            sys.exit()
        print("Please enter y or n.")
# Check for old file
answer = None
# Only ask when the user has not pre-approved overwriting and there is
# actually an existing output file in the way.
if not args.overwrite and deleteme.is_file():
    while True:
        answer = input("Output file already exists, delete and make a new one? [y/n]")
        if answer == "y":
            # Goes to the system trash rather than being unlinked outright.
            send2trash(deleteme)
            break
        if answer == "n":
            print ("OK, bye for now...\n\n")
            sys.exit()
        print("Please enter y or n.")
# Test File Format (in case of extension error) and set flags
# Format flags (1 = detected). Detection is content based so a wrong file
# extension does not matter. Both flags can end up set for a WEBVTT file that
# also contains a "1" line followed by a timecode.
webvtt = 0
srt = 0
if not args.copy:
    # Compiled once instead of on every line; pattern matches a cue timing
    # line such as "00:00:01,000 --> 00:00:02,000".
    detect_timecode = re.compile(r"(.*:.*:.*-->.*:.*:.*)")
    with open(ifile, 'r', encoding=encoding) as testsub:
        for line in testsub:
            if "WEBVTT" in line:
                webvtt = 1
            # next() is given a default so a file whose last line is "1" ends
            # the scan cleanly instead of raising StopIteration.
            elif line.strip('\n') == "1" and detect_timecode.search(next(testsub, '')):
                srt = 1
    if not (webvtt or srt):
        # Previously the script reported success without writing anything.
        print('Input does not look like SRT or WEBVTT, no output file was written.')
# SRT format
if srt == 1:
    # Cue timing line, e.g. "00:00:01,000 --> 00:00:02,000". Compiled once.
    srt_timecode = re.compile(r"(.*:.*:.*-->.*:.*:.*)")
    # In --oneliners mode a line ending in one of these closes the sentence.
    sentence_enders = (".", "?", "!", "…")
    with open(ifile, 'r', encoding=encoding) as original, open(ofile, 'w', encoding=encset) as new:
        subnum = 1        # next subtitle index expected in the file
        held_back = None  # look-ahead line that turned out not to be a timecode
        while True:
            if held_back is not None:
                line, held_back = held_back, None
            else:
                line = next(original, None)
                if line is None:
                    break

            if line.strip('\n') == str(subnum):
                # A subtitle index is only an index when a timecode follows.
                # The default on next() avoids StopIteration at end of file.
                lookahead = next(original, None)
                if lookahead is not None and srt_timecode.search(lookahead):
                    # Ignore SRT Subtitle # and Timecode lines
                    subnum += 1
                    continue
                # Not a timecode: the number is dialogue, and the line read
                # ahead must still be processed rather than silently dropped.
                held_back = lookahead

            if line.strip('\n') == '':
                continue
            if args.screen:
                print(line, end='')
            if not args.oneliners:
                new.write(line)
                continue

            line = line.strip()
            if not line:
                # Whitespace-only line: nothing to join (line[-1] used to
                # raise IndexError here).
                continue
            new.write(line + ('\n' if line.endswith(sentence_enders) else ' '))
# WEBVTT format
if webvtt == 1:
    # Patterns compiled once rather than on every line.
    vtt_timecode = re.compile(r"(.*:.*:.*-->.*:.*:.*)")   # cue timing line
    vtt_header = re.compile(r"^(Kind|Language):.*$")      # header metadata lines
    # In --oneliners mode a line ending in one of these closes the sentence.
    sentence_enders = (".", "?", "!", "…")
    with open(ifile, 'r', encoding=encoding) as original, open(ofile, 'w', encoding=encset) as new:
        for line in original:
            # Drop the WEBVTT signature, header metadata and timing lines.
            if "WEBVTT" in line or vtt_header.search(line) or vtt_timecode.search(line):
                continue
            if line.strip('\n') == '':
                continue
            if args.screen:
                print(line, end='')
            if not args.oneliners:
                new.write(line)
                continue

            line = line.strip()
            if not line:
                # Whitespace-only line: nothing to join (line[-1] used to
                # raise IndexError here).
                continue
            new.write(line + ('\n' if line.endswith(sentence_enders) else ' '))
# Copy mode
if args.copy:
    # Straight line-by-line copy; only the encoding may differ (input is read
    # with the detected encoding, output is written with encset).
    source_handle = open(ifile, 'r', encoding=encoding)
    target_handle = open(cfile, 'w', encoding=encset)
    with source_handle as original, target_handle as new:
        echo_to_console = args.screen
        for text in original:
            if echo_to_console:
                print(text, end='')
            new.write(text)

# Printed on every run, whichever mode was used.
print('\nFinished\n')