Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
akshithg committed Jun 2, 2018
1 parent c5360f7 commit 4a8fb1a
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 52 deletions.
64 changes: 56 additions & 8 deletions feature_calculator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
calculates features for a given file
usage: python feature_calculator.py <file_with_clean_data>
'''

import sys
Expand All @@ -12,7 +13,7 @@ def usage_str():
'''
usage string
'''
print("python feature_calculator.py file_with_clean_data")
print("usage: python feature_calculator.py <file_with_clean_data>")

def decorator(n):
'''
Expand All @@ -23,13 +24,15 @@ class FeatureCalculator:
def __init__(self, source_file):
self.source_file = source_file
self.data = pd.read_csv(source_file, skip_blank_lines=True)
self.set_sequence_column()
self.data.index.name = 'seq_no'
self.SEQ = 'seq'
self.non_feature_columns = self.data.columns.tolist()


def set_sequence_column(self, seq):
def set_sequence_column(self, seq="seq"):
self.SEQ = seq

def feature_columns(self):
    '''
    returns the names of the columns of self.data that are not listed in
    self.non_feature_columns, as a list in DataFrame column order
    '''
    # a plain set difference would return the names in arbitrary order, which
    # makes the column order of the saved feature file vary between runs
    excluded = set(self.non_feature_columns)
    return [col for col in self.data.columns.tolist() if col not in excluded]

def save_features(self, columns=[]):
'''
Expand All @@ -40,13 +43,16 @@ def save_features(self, columns=[]):

feature_file = self.source_file+"._features"
self.data.to_csv(feature_file, sep=',', columns=columns)
print("saving features: "+str(columns))
print("Saved to " + feature_file)


def gc_content(self):
'''
feature: adds gc_content attribute
'''
print ("calculating GC content")

def calc(seq):
g = seq.count('G')
c = seq.count('C')
Expand All @@ -60,6 +66,8 @@ def tataaa_box_present(self):
'''
feature: adds tataaa box attribute
'''
print ("calculating TATAAA box")

def calc(seq):
# 1 if the substring 'TATAA' occurs anywhere in seq, else 0
# NOTE(review): the literal is the 5-mer 'TATAA' while the surrounding docstring and log message say TATAAA -- confirm which motif is intended
return int('TATAA' in seq)

Expand All @@ -71,6 +79,8 @@ def gc_box(self):
'''
feature: adds gc box attribute: CCAAT and GGGCGG
'''
print ("calculating GC box")

def calc(seq):
    # 1 when either GC box motif occurs as a substring of seq, else 0
    motifs = ['CCAAT', 'GGGCGG']
    # test each motif against seq; iterating over seq instead would compare
    # single characters with whole motifs and never match
    return int(any(motif in seq for motif in motifs))
Expand All @@ -83,6 +93,8 @@ def poly_a_tail(self, n=3):
'''
3 or more As in the last 36 nucleotides
'''
print ("calculating poly A tail")

def calc(seq):
if len(seq) < 36:
return 0
Expand All @@ -101,6 +113,8 @@ def stop_codon_present(self):
'''
Checks whether any of the stop codons TAA, TGA, TAG is present
'''
print ("calculating stop codons")

def calc(seq):
STOP = ["TAA", "TGA", "TAG"]
# check if any of the stop codons are in the seq
Expand All @@ -110,26 +124,60 @@ def calc(seq):
return self.data


def sequence_length(self):
    '''
    feature: adds seq_length attribute
    stores the length of every value in the self.SEQ column of self.data
    in a new 'seq_length' column and returns self.data
    '''
    print("calculating sequence length")

    # len is applied to each sequence value; no wrapper function is needed
    self.data['seq_length'] = self.data[self.SEQ].apply(len)
    return self.data


def feature_template(self):
'''
feature: adds new attribute
template to copy when adding a feature: applies calc to every value in
the self.SEQ column, stores the results in a new column of self.data
and returns self.data
'''
print("calculating ... ? ")
def calc(seq):
# CHANGE THIS
# val = calculate feature value on seq
# placeholder: every row gets 0 until calc is replaced
val = 0
return val

# the column name is a placeholder as well; rename it for the real feature
self.data['CHANGE_THIS_to_your_feature_name'] = self.data[self.SEQ].apply(calc)
return self.data


## starts here
decorator(25)

if(len(sys.argv) != 2):
usage_str()
else:
feature_calculator = FeatureCalculator(sys.argv[1])
decorator(25)
print("`feature_calculator` object created, you can now interact with it")
decorator(25)
code.interact(local=dict(globals(), **locals()))

# # interactive run
# decorator(25)
# print("`feature_calculator` object created, you can now interact with it")
# decorator(25)
# code.interact(local=dict(globals(), **locals()))


# calculate features
feature_calculator.gc_content()
feature_calculator.tataaa_box_present()
feature_calculator.gc_box()
feature_calculator.poly_a_tail()
feature_calculator.stop_codon_present()
feature_calculator.sequence_length()

# save features to file
feature_calculator.save_features(feature_calculator.feature_columns())

print("DONE")

decorator(25)
37 changes: 22 additions & 15 deletions file_splitter.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
'''
file chop chop
usage: python file_splitter.py file NUMBER_OF_CHOPS
usage: python file_splitter.py <file> <splits>
'''

import sys
import os

def usage_str():
'''
usage string
'''
print("usage: python file_splitter.py file number_of_parts")
print("usage: python file_splitter.py <file> <splits>")

def decorator(n):
'''
prints a separator line made of n asterisks
'''
print("*"*n)

def split_file(path, chops):
def split_file(path, splits):
'''
splits file into n chops
splits file
'''
file = open(path)

input_file = open(path)
# read header line
header = file.readline()
# read resst of the data
data = file.readlines()

header = input_file.readline()
# read rest of the data
data = input_file.readlines()
# total lines
total_lines = len(data)
# calculate line per chop
lines_per_chop = int(total_lines / chops) + (total_lines % chops > 0)
lines_per_split = int(total_lines / splits) + (total_lines % splits > 0)

# create output dir
output_path = path + ".splits"
if not os.path.exists(output_path):
os.makedirs(output_path)

# write into each split
print("Part files:")
for i in range(chops):
for i in range(splits):
# create a part file
part_file = path+".part"+str(i+1)
part_file = output_path+"/part"+str(i+1)
print(part_file)
part_file = open(part_file, "w")

# write header
part_file.write(header)

# write data
for line in data[i*lines_per_chop : i*lines_per_chop + lines_per_chop]:
for line in data[i*lines_per_split : i*lines_per_split + lines_per_split]:
part_file.write(line)

# close file
Expand All @@ -66,5 +72,6 @@ def split_file(path, chops):
else:
split_file(sys.argv[1], chops)

print("DONE")

decorator(25)
print("DONE")
25 changes: 16 additions & 9 deletions merge_feature_file.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,44 @@
'''
file join
usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile
usage: python merge_features.py directory output_file
'''

import sys
import os

def usage_str():
'''
usage string
'''
print("usage: python file_splitter.py file1 file2 [file3] ... [fileN] outputfile")
print("usage: python merge_features.py <directory> <output_file>")

def decorator(n):
'''
prints a separator line made of n asterisks
'''
print("*"*n)

def join_files(input_files, output_file):
def join_files(directory, output_file):
'''
join all input_files into one output_file
'''
print("writing to :"+output_file)
# Files in directory
print("files in the directory:")
input_files = []
for file in os.listdir(directory):
file = directory + "/" + file
print(file)
input_files.append(file)

print("\nwriting to :"+output_file)
output_file = open(output_file, "w")

write_header = False
for file in input_files:
print("reading from: "+file)
file = open(file)
header = file.readline()
data = file.readlines()

# write header once
if not write_header:
output_file.write(header)
write_header = True
Expand All @@ -44,16 +53,14 @@ def join_files(input_files, output_file):
output_file.close()




# **** Starts here ****
decorator(25)

# read file and p from args
if(len(sys.argv) < 3):
usage_str()
else:
join_files(sys.argv[1:-1], sys.argv[-1])
join_files(sys.argv[1], sys.argv[2])
print("DONE")

decorator(25)
print("DONE")
50 changes: 30 additions & 20 deletions process_fasta.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
'''
clean and format fasta files
usage: python pocess_fasta.py file1 [file2] ... [fileN]
default output: <input_file_path>.clean
usage: python process_fasta.py <fasta_file> [output_file]
'''

import sys

COLUMNS = ['gene_stable_id', 'transcript_stable_id', 'seq']

def usage_str():
    '''
    prints the command line usage of this script
    '''
    # the script is process_fasta.py; the message previously named "pocess_fasta.py"
    print("usage: python process_fasta.py <fasta_file> [output_file]")

def decorator(n):
'''
'''
Expand All @@ -25,48 +32,51 @@ def format_data_line(line):
data_line = line[:15] + "," + line[15:30] + "," + line[30:]
return data_line

def fasta_format(path):
def fasta_format(path, output_file):
'''
cleans and formats file and writes to file.clean
removes Sequence unavailable lines too
'''
print("formatting file: "+path)
# create a file handler
file = open(path, 'r')
input_file = open(path, 'r')

# read, replace new line, split at >
lines = file.read()\
.replace("\n", "")\
.replace("|", "")\
.split(">")[1:]
lines = input_file.read()\
.replace("\n", "")\
.replace("|", "")\
.split(">")[1:]

# close file
file.close()
input_file.close()

# write to a new file
new_file = open(path+".clean", "w")
# write to output file
print("writing to: "+output_file)
output_file = open(output_file, "w")

# write header line
new_file.write(header_line()+"\n")
output_file.write(header_line()+"\n")
# write data lines
for line in lines:
if(not "Sequence unavailable" in line):
new_file.write(format_data_line(line) + "\n")
output_file.write(format_data_line(line) + "\n")

# flush and close file.clean
new_file.flush()
new_file.close()
output_file.flush()
output_file.close()


# **** Starts here ****
decorator(25)
# read file from args
if(len(sys.argv) < 2):
print("usage: python pocess_fasta.py file1 [file2] ... [fileN]")
usage_str()
else:
for file in (sys.argv[1:]):
# formats file
print("formatting file: "+file)
fasta_format(file)
try:
fasta_format(path=sys.argv[1], output_file=sys.argv[2])
except:
fasta_format(path=sys.argv[1], output_file=sys.argv[1]+".clean")

print("DONE")

decorator(25)
print("DONE")

0 comments on commit 4a8fb1a

Please sign in to comment.