-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpeptides.py
116 lines (84 loc) · 3.73 KB
/
peptides.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
This is the core part of the Python program. It defines nested functions: the
first reads the FASTA file to obtain the reference sequence, and the second
performs the peptide match and records each result on a new line in the CSV file.
REQUIREMENTS
The entire protein sequence in the FASTA file must be written on a single line
(no line breaks within the sequence). For guidance, see:
https://github.com/ying-li-python/fasta-fix
"""
def peptideSearch(peptide, phospho_site, *, fasta_path="period.fasta",
                  output_path="peptideSearch_output.csv"):
    """Find a peptide in the reference FASTA sequence and log the hit to CSV.

    The reference sequence is read from ``fasta_path``. Lines starting with
    ``>`` are treated as headers and skipped; the remaining lines of the last
    record are stripped and joined, so the sequence may be wrapped over
    several lines. ``peptide`` is searched for with ``re.search`` (it is
    therefore interpreted as a regular expression) and only the first match
    is reported.

    On a match, a single row ``peptide, start, end, phospho position`` is
    appended to ``output_path``. ``start`` and ``end`` are 1-based and
    inclusive. If no match is found, a message is printed and nothing is
    written.

    Args:
        peptide: Peptide sequence (regex pattern) to look for.
        phospho_site: 1-based position of the phosphorylated residue within
            the peptide; any value accepted by ``int``. Zero or a negative
            value records the string ``"None"`` instead of a position.
        fasta_path: Path of the reference FASTA file.
        output_path: Path of the CSV file the result row is appended to.

    Returns:
        None. Results are reported on stdout and in the CSV file.

    Raises:
        OSError: If the FASTA file cannot be read or the CSV cannot be
            opened for appending.
        ValueError: If a match is found and ``phospho_site`` cannot be
            converted to ``int``.
    """
    # Imported at function scope (as the original did) so the block does not
    # depend on module-level imports.
    import csv
    import re

    def read_reference_sequence():
        """Return the sequence of the last FASTA record as one string."""
        chunks = []
        # The context manager closes the handle even if reading fails.
        with open(fasta_path, "r") as fasta:
            for line in fasta:
                if line.startswith(">"):
                    # A header starts a new record; discard earlier lines so
                    # the last record is the one used as the reference.
                    chunks = []
                    continue
                # strip() drops the newline; blank lines contribute nothing.
                chunks.append(line.strip())
        return "".join(chunks)

    def match_and_record(reference):
        """Search ``reference`` for the peptide and append the hit to CSV."""
        match = re.search(peptide, reference)
        if not match:
            print("-----------------")
            print("Peptide not found in reference FASTA sequence. Please try again.")
            print("-----------------")
            return

        # match.start() is 0-based, so +1 gives the 1-based start.
        start_position = match.start() + 1
        # match.end() is 0-based *exclusive*, which already equals the
        # 1-based inclusive end; adding 1 would point past the peptide.
        end_position = match.end()

        # Convert once so string input ("3") works in the arithmetic too.
        site = int(phospho_site)
        if site > 0:
            phospho_position = (start_position - 1) + site
        else:
            # A non-positive site means "no phosphorylation site given".
            phospho_position = "None"

        print("-----------------")
        print("Peptide matched and added to analysis!")
        print("-----------------")

        # newline="" lets the csv module control line endings (avoids blank
        # rows on Windows).
        with open(output_path, "a", newline="") as csvfile:
            peptide_writer = csv.writer(csvfile, delimiter=",")
            peptide_writer.writerow(
                [peptide, start_position, end_position, phospho_position]
            )

    return match_and_record(read_reference_sequence())