refactor

akshithg · Jun 2, 2018 · 4a8fb1a · 4a8fb1a
1 parent c5360f7
commit 4a8fb1a
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 52 deletions.
diff --git a/feature_calculator.py b/feature_calculator.py
@@ -1,5 +1,6 @@
 '''
 calculates features for a given file
+usage: python feature_calculator.py <file_with_clean_data>
 '''
 
 import sys
@@ -12,7 +13,7 @@ def usage_str():
     '''
     usage string
     '''
-    print("python feature_calculator.py file_with_clean_data")
+    print("usage: python feature_calculator.py <file_with_clean_data>")
 
 def decorator(n):
     '''
@@ -23,13 +24,15 @@ class FeatureCalculator:
     def __init__(self, source_file):
         self.source_file = source_file
         self.data = pd.read_csv(source_file, skip_blank_lines=True)
+        self.set_sequence_column()
         self.data.index.name = 'seq_no'
-        self.SEQ = 'seq'
+        self.non_feature_columns = self.data.columns.tolist()
 
-
-    def set_sequence_column(self, seq):
+    def set_sequence_column(self, seq="seq"):
         self.SEQ = seq
 
+    def feature_columns(self):
+        return list(set(self.data.columns.tolist()) - set(self.non_feature_columns))
 
     def save_features(self, columns=[]):
         '''
@@ -40,13 +43,16 @@ def save_features(self, columns=[]):
 
         feature_file = self.source_file+"._features"
         self.data.to_csv(feature_file, sep=',', columns=columns)
+        print("saving features: "+str(columns))
         print("Saved to " + feature_file)
 
 
     def gc_content(self):
         '''
         feature: adds gc_content attribute
         '''
+        print ("calculating GC content")
+
         def calc(seq):
             g = seq.count('G')
             c = seq.count('C')
@@ -60,6 +66,8 @@ def tataaa_box_present(self):
         '''
         feature: adds tataaa box attribute
         '''
+        print ("calculating TATAAA box")
+
         def calc(seq):
             return int('TATAA' in seq)
 
@@ -71,6 +79,8 @@ def gc_box(self):
         '''
         feature: adds gc box attribute: CCAAT and GGGCGG
         '''
+        print ("calculating GC box")
+
         def calc(seq):
             gc = ['CCAAT', 'GGGCGG']
             return int(any(s in gc for s in seq))
@@ -83,6 +93,8 @@ def poly_a_tail(self, n=3):
         '''
         3 or more As in the last 36 nucleotides
         '''
+        print ("calculating poly A tail")
+
         def calc(seq):
             if len(seq) < 36:
                 return 0
@@ -101,6 +113,8 @@ def stop_codon_present(self):
         '''
         Checks for the presence of stop codons TAA, TGA, TAG
         '''
+        print ("calculating stop codons")
+
         def calc(seq):
             STOP = ["TAA", "TGA", "TAG"]
             # check if any of the stop codons are in the seq
@@ -110,26 +124,60 @@ def calc(seq):
         return self.data
 
 
+    def sequence_length(self):
+        '''
+        calculates length of the seq
+        '''
+        print("calcualting sequence length")
+
+        def calc(seq):
+            return len(seq)
+
+        self.data['seq_length'] = self.data[self.SEQ].apply(calc)
+        return self.data
+
+
     def feature_template(self):
         '''
         feature: adds new attribute
         '''
+        print("calculating ... ? ")
         def calc(seq):
             # CHANGE THIS
             # val = calculate feature value on seq
+            val = 0
             return val
 
         self.data['CHANGE_THIS_to_your_feature_name'] = self.data[self.SEQ].apply(calc)
         return self.data
 
 
 ## starts here
+decorator(25)
 
 if(len(sys.argv) != 2):
     usage_str()
 else:
     feature_calculator = FeatureCalculator(sys.argv[1])
-    decorator(25)
-    print("`feature_calculator` object created, you can now interact with it")
-    decorator(25)
-    code.interact(local=dict(globals(), **locals()))
+
+    # # interactive run
+    # decorator(25)
+    # print("`feature_calculator` object created, you can now interact with it")
+    # decorator(25)
+    # code.interact(local=dict(globals(), **locals()))
+
+
+    # calculate features
+    feature_calculator.gc_content()
+    feature_calculator.tataaa_box_present()
+    feature_calculator.gc_box()
+    feature_calculator.poly_a_tail()
+    feature_calculator.stop_codon_present()
+    feature_calculator.sequence_length()
+
+    # save features to file
+    feature_calculator.save_features(feature_calculator.feature_columns())
+
+    print("DONE")
+
+decorator(25)
diff --git a/file_splitter.py b/file_splitter.py
@@ -1,48 +1,54 @@
 '''
 file chop chop
-usage: python file_splitter.py file NUMBER_OF_CHOPS
+usage: python file_splitter.py <file> <splits>
 '''
 
 import sys
+import os
 
 def usage_str():
     '''
     usage string
     '''
-    print("usage: python file_splitter.py file number_of_parts")
+    print("usage: python file_splitter.py <file> <splits>")
 
 def decorator(n):
     '''
     '''
     print("*"*n)
 
-def split_file(path, chops):
+def split_file(path, splits):
     '''
-    splits file into n chops
+    splits file
     '''
-    file = open(path)
-
+    input_file = open(path)
     # read header line
-    header = file.readline()
-    # read resst of the data
-    data = file.readlines()
-
+    header = input_file.readline()
+    # read rest of the data
+    data = input_file.readlines()
     # total lines
     total_lines = len(data)
     # calculate lines per split
-    lines_per_chop = int(total_lines / chops) + (total_lines % chops > 0)
+    lines_per_split = int(total_lines / splits) + (total_lines % splits > 0)
+
+    # create output dir
+    output_path = path + ".splits"
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    # write into each split
     print("Part files:")
-    for i in range(chops):
+    for i in range(splits):
         # create a part file
-        part_file = path+".part"+str(i+1)
+        part_file = output_path+"/part"+str(i+1)
         print(part_file)
         part_file = open(part_file, "w")
 
         # write header
         part_file.write(header)
 
         # write data
-        for line in data[i*lines_per_chop : i*lines_per_chop + lines_per_chop]:
+        for line in data[i*lines_per_split : i*lines_per_split + lines_per_split]:
             part_file.write(line)
 
         # close file
@@ -66,5 +72,6 @@ def split_file(path, chops):
     else:
         split_file(sys.argv[1], chops)
 
+    print("DONE")
+
 decorator(25)
-print("DONE")
diff --git a/merge_feature_file.py b/merge_feature_file.py
@@ -1,35 +1,44 @@
 '''
 file join
-usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile
+usage: python merge_feature_file.py <directory> <output_file>
 '''
 
 import sys
+import os
 
 def usage_str():
     '''
     usage string
     '''
-    print("usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile")
+    print("usage: python merge_features.py <directory> <output_file>")
 
 def decorator(n):
     '''
     '''
     print("*"*n)
 
-def join_files(input_files, output_file):
+def join_files(directory, output_file):
     '''
     join all files found in directory into one output_file
     '''
-    print("writing to :"+output_file)
+    # Files in directory
+    print("files in the directory:")
+    input_files = []
+    for file in os.listdir(directory):
+        file = directory + "/" + file
+        print(file)
+        input_files.append(file)
+
+    print("\nwriting to :"+output_file)
     output_file = open(output_file, "w")
 
     write_header = False
     for file in input_files:
-        print("reading from: "+file)
         file = open(file)
         header = file.readline()
         data = file.readlines()
 
+        # write header once
         if not write_header:
             output_file.write(header)
             write_header = True
@@ -44,16 +53,14 @@ def join_files(input_files, output_file):
     output_file.close()
 
 
-
-
 # **** Starts here ****
 decorator(25)
 
 # read directory and output file from args
 if(len(sys.argv) < 3):
     usage_str()
 else:
-    join_files(sys.argv[1:-1], sys.argv[-1])
+    join_files(sys.argv[1], sys.argv[2])
+    print("DONE")
 
 decorator(25)
-print("DONE")
diff --git a/process_fasta.py b/process_fasta.py
@@ -1,12 +1,19 @@
 '''
 clean and format fasta files
-usage: python pocess_fasta.py file1 [file2] ... [fileN]
+default output: <input_file_path>.clean
+usage: python process_fasta.py <fasta_file> [output_file]
 '''
 
 import sys
 
 COLUMNS = ['gene_stable_id', 'transcript_stable_id', 'seq']
 
+def usage_str():
+    '''
+    usage string
+    '''
+    print("usage: python pocess_fasta.py <fasta_file> [output_file]")
+
 def decorator(n):
     '''
     '''
@@ -25,48 +32,51 @@ def format_data_line(line):
     data_line = line[:15] + "," + line[15:30] + "," + line[30:]
     return data_line
 
-def fasta_format(path):
+def fasta_format(path, output_file):
     '''
     cleans and formats file and writes to output_file
     removes Sequence unavailable lines too
     '''
+    print("formatting file: "+path)
     # create a file handler
-    file = open(path, 'r')
+    input_file = open(path, 'r')
 
     # read, replace new line, split at >
-    lines = file.read()\
-                .replace("\n", "")\
-                .replace("|", "")\
-                .split(">")[1:]
+    lines = input_file.read()\
+                      .replace("\n", "")\
+                      .replace("|", "")\
+                      .split(">")[1:]
 
     # close file
-    file.close()
+    input_file.close()
 
-    # write to a new file
-    new_file = open(path+".clean", "w")
+    # write to output file
+    print("writing to: "+output_file)
+    output_file = open(output_file, "w")
 
     # write header line
-    new_file.write(header_line()+"\n")
+    output_file.write(header_line()+"\n")
     # write data lines
     for line in lines:
         if(not "Sequence unavailable" in line):
-            new_file.write(format_data_line(line) + "\n")
+            output_file.write(format_data_line(line) + "\n")
 
     # flush and close output file
-    new_file.flush()
-    new_file.close()
+    output_file.flush()
+    output_file.close()
 
 
 # **** Starts here ****
 decorator(25)
 # read file from args
 if(len(sys.argv) < 2):
-    print("usage: python pocess_fasta.py file1 [file2] ... [fileN]")
+    usage_str()
 else:
-    for file in (sys.argv[1:]):
-        # formats file
-        print("formatting file: "+file)
-        fasta_format(file)
+    try:
+        fasta_format(path=sys.argv[1], output_file=sys.argv[2])
+    except:
+        fasta_format(path=sys.argv[1], output_file=sys.argv[1]+".clean")
+
+    print("DONE")
 
 decorator(25)
-print("DONE")