From 4a8fb1ae9aa9de437d2ac5a60862100b893a5154 Mon Sep 17 00:00:00 2001 From: Akshith Gunasekaran Date: Fri, 1 Jun 2018 18:52:53 -0700 Subject: [PATCH] refactor --- feature_calculator.py | 64 +++++++++++++++++++++++++++++++++++++------ file_splitter.py | 37 +++++++++++++++---------- merge_feature_file.py | 25 +++++++++++------ process_fasta.py | 50 +++++++++++++++++++-------------- 4 files changed, 124 insertions(+), 52 deletions(-) diff --git a/feature_calculator.py b/feature_calculator.py index a4ccc35..32c0484 100644 --- a/feature_calculator.py +++ b/feature_calculator.py @@ -1,5 +1,6 @@ ''' calculates features for a given file +usage: python feature_calculator.py ''' import sys @@ -12,7 +13,7 @@ def usage_str(): ''' usage string ''' - print("python feature_calculator.py file_with_clean_data") + print("usage: python feature_calculator.py ") def decorator(n): ''' @@ -23,13 +24,15 @@ class FeatureCalculator: def __init__(self, source_file): self.source_file = source_file self.data = pd.read_csv(source_file, skip_blank_lines=True) + self.set_sequence_column() self.data.index.name = 'seq_no' - self.SEQ = 'seq' + self.non_feature_columns = self.data.columns.tolist() - - def set_sequence_column(self, seq): + def set_sequence_column(self, seq="seq"): self.SEQ = seq + def feature_columns(self): + return list(set(self.data.columns.tolist()) - set(self.non_feature_columns)) def save_features(self, columns=[]): ''' @@ -40,6 +43,7 @@ def save_features(self, columns=[]): feature_file = self.source_file+"._features" self.data.to_csv(feature_file, sep=',', columns=columns) + print("saving features: "+str(columns)) print("Saved to " + feature_file) @@ -47,6 +51,8 @@ def gc_content(self): ''' feature: adds gc_content attribute ''' + print ("calculating GC content") + def calc(seq): g = seq.count('G') c = seq.count('C') @@ -60,6 +66,8 @@ def tataaa_box_present(self): ''' feature: adds tataaa box attribute ''' + print ("calculating TATAAA box") + def calc(seq): return int('TATAA' in seq) @@ -71,6 +79,8 @@ def gc_box(self): ''' feature: adds gc box attribute: CCAAT and GGGCGG ''' + print ("calculating GC box") + def calc(seq): gc = ['CCAAT', 'GGGCGG'] return int(any(s in gc for s in seq)) @@ -83,6 +93,8 @@ def poly_a_tail(self, n=3): ''' 3 or more As in last 36 nucleotide ''' + print ("calculating poly A tail") + def calc(seq): if len(seq) < 36: return 0 @@ -101,6 +113,8 @@ def stop_codon_present(self): ''' Checks of on stop codons TAA TGA TAG ''' + print ("calculating stop codons") + def calc(seq): STOP = ["TAA", "TGA", "TAG"] # check in any of the stop codons are in the seq @@ -110,13 +124,28 @@ def calc(seq): return self.data + def sequence_length(self): + ''' + calculates length of the seq + ''' + print("calcualting sequence length") + + def calc(seq): + return len(seq) + + self.data['seq_length'] = self.data[self.SEQ].apply(calc) + return self.data + + def feature_template(self): ''' feature: adds new attribute ''' + print("calculating ... ? ") def calc(seq): # CHANGE THIS # val = calculate feature value on seq + val = 0 return val self.data['CHANGE_THIS_to_your_feature_name'] = self.data[self.SEQ].apply(calc) @@ -124,12 +153,31 @@ def calc(seq): ## starts here +decorator(25) if(len(sys.argv) != 2): usage_str() else: feature_calculator = FeatureCalculator(sys.argv[1]) - decorator(25) - print("`feature_calculator` object created, you can now interact with it") - decorator(25) - code.interact(local=dict(globals(), **locals())) + + # # interactive run + # decorator(25) + # print("`feature_calculator` object created, you can now interact with it") + # decorator(25) + # code.interact(local=dict(globals(), **locals())) + + + # calculate features + feature_calculator.gc_content() + feature_calculator.tataaa_box_present() + feature_calculator.gc_box() + feature_calculator.poly_a_tail() + feature_calculator.stop_codon_present() + feature_calculator.sequence_length() + + # save features to file + feature_calculator.save_features(feature_calculator.feature_columns()) + + print("DONE") + +decorator(25) diff --git a/file_splitter.py b/file_splitter.py index d78b103..95965a4 100644 --- a/file_splitter.py +++ b/file_splitter.py @@ -1,40 +1,46 @@ ''' file chop chop -usage: python file_splitter.py file NUMBER_OF_CHOPS +usage: python file_splitter.py ''' import sys +import os def usage_str(): ''' usage string ''' - print("usage: python file_splitter.py file number_of_parts") + print("usage: python file_splitter.py ") def decorator(n): ''' ''' print("*"*n) -def split_file(path, chops): +def split_file(path, splits): ''' - splits file into n chops + splits file ''' - file = open(path) - + input_file = open(path) # read header line - header = file.readline() - # read resst of the data - data = file.readlines() - + header = input_file.readline() + # read rest of the data + data = input_file.readlines() # total lines total_lines = len(data) # calculate line per chop - lines_per_chop = int(total_lines / chops) + (total_lines % chops > 0) + lines_per_split = int(total_lines / splits) + (total_lines % splits > 0) + + # create output dir + output_path = path + ".splits" + if not os.path.exists(output_path): + os.makedirs(output_path) + + # write into each split print("Part files:") - for i in range(chops): + for i in range(splits): # create a part file - part_file = path+".part"+str(i+1) + part_file = output_path+"/part"+str(i+1) print(part_file) part_file = open(part_file, "w") @@ -42,7 +48,7 @@ def split_file(path, chops): part_file.write(header) # write data - for line in data[i*lines_per_chop : i*lines_per_chop + lines_per_chop]: + for line in data[i*lines_per_split : i*lines_per_split + lines_per_split]: part_file.write(line) # close file @@ -66,5 +72,6 @@ def split_file(path, chops): else: split_file(sys.argv[1], chops) + print("DONE") + decorator(25) -print("DONE") diff --git a/merge_feature_file.py b/merge_feature_file.py index eac2524..71fafd7 100644 --- a/merge_feature_file.py +++ b/merge_feature_file.py @@ -1,35 +1,44 @@ ''' file join -usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile +usage: python merge_features.py directory output_file ''' import sys +import os def usage_str(): ''' usage string ''' - print("usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile") + print("usage: python merge_features.py ") def decorator(n): ''' ''' print("*"*n) -def join_files(input_files, output_file): +def join_files(directory, output_file): ''' join all input_files into one output_file ''' - print("writing to :"+output_file) + # Files in directory + print("files in the directory:") + input_files = [] + for file in os.listdir(directory): + file = directory + "/" + file + print(file) + input_files.append(file) + + print("\nwriting to :"+output_file) output_file = open(output_file, "w") write_header = False for file in input_files: - print("reading from: "+file) file = open(file) header = file.readline() data = file.readlines() + # write header once if not write_header: output_file.write(header) write_header = True @@ -44,8 +53,6 @@ def join_files(input_files, output_file): output_file.close() - - # **** Starts here **** decorator(25) @@ -53,7 +60,7 @@ def join_files(input_files, output_file): if(len(sys.argv) < 3): usage_str() else: - join_files(sys.argv[1:-1], sys.argv[-1]) + join_files(sys.argv[1], sys.argv[2]) + print("DONE") decorator(25) -print("DONE") diff --git a/process_fasta.py b/process_fasta.py index bd9c4a6..9c57f3d 100644 --- a/process_fasta.py +++ b/process_fasta.py @@ -1,12 +1,19 @@ ''' clean and format fasta files -usage: python pocess_fasta.py file1 [file2] ... [fileN] +default output: .clean +usage: python pocess_fasta.py [output_file] ''' import sys COLUMNS = ['gene_stable_id', 'transcript_stable_id', 'seq'] +def usage_str(): + ''' + usage string + ''' + print("usage: python pocess_fasta.py [output_file]") + def decorator(n): ''' ''' @@ -25,48 +32,51 @@ def format_data_line(line): data_line = line[:15] + "," + line[15:30] + "," + line[30:] return data_line -def fasta_format(path): +def fasta_format(path, output_file): ''' cleans and formats file and writes to file.clean removes Sequence unavailable lines too ''' + print("formatting file: "+path) # create a file handler - file = open(path, 'r') + input_file = open(path, 'r') # read, replace new line, split at > - lines = file.read()\ - .replace("\n", "")\ - .replace("|", "")\ - .split(">")[1:] + lines = input_file.read()\ + .replace("\n", "")\ + .replace("|", "")\ + .split(">")[1:] # close file - file.close() + input_file.close() - # write to a new file - new_file = open(path+".clean", "w") + # write to output file + print("writing to: "+output_file) + output_file = open(output_file, "w") # write header line - new_file.write(header_line()+"\n") + output_file.write(header_line()+"\n") # write data lines for line in lines: if(not "Sequence unavailable" in line): - new_file.write(format_data_line(line) + "\n") + output_file.write(format_data_line(line) + "\n") # flush and close file.clean - new_file.flush() - new_file.close() + output_file.flush() + output_file.close() # **** Starts here **** decorator(25) # read file from args if(len(sys.argv) < 2): - print("usage: python pocess_fasta.py file1 [file2] ... [fileN]") + usage_str() else: - for file in (sys.argv[1:]): - # formats file - print("formatting file: "+file) - fasta_format(file) + try: + fasta_format(path=sys.argv[1], output_file=sys.argv[2]) + except: + fasta_format(path=sys.argv[1], output_file=sys.argv[1]+".clean") + + print("DONE") decorator(25) -print("DONE")