Commit c1e5d044 by re

直播

parent f3b016b9
Unnamed
Unnamed
Chem3D Core 21.010272417003D
55 57 0 0 0 0 0 0 0 0999 V2000
-5.7857 1.9733 -1.0316 C 0 0 0 0 0 0 0 0 0 0 0 0
-6.0640 0.6117 -0.9700 C 0 0 0 0 0 0 0 0 0 0 0 0
-5.0178 -0.2959 -0.8280 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.6902 0.1696 -0.7480 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.3983 1.5375 -0.8089 C 0 0 0 0 0 0 0 0 0 0 0 0
-4.4620 2.4205 -0.9508 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.5904 -0.6658 -0.6061 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.3187 -0.2308 -0.5289 N 0 0 0 0 0 0 0 0 0 0 0 0
-1.1654 1.0983 -0.5987 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.1444 2.0067 -0.7352 N 0 0 0 0 0 0 0 0 0 0 0 0
-6.7957 2.8660 -1.1697 O 0 0 0 0 0 0 0 0 0 0 0 0
-7.3433 0.1721 -1.0475 O 0 0 0 0 0 0 0 0 0 0 0 0
0.0187 1.5409 -0.5293 N 0 0 0 0 0 0 0 0 0 0 0 0
1.1446 0.6077 -0.3792 C 0 0 0 0 0 0 0 0 0 0 0 0
2.4496 1.3907 -0.3210 C 0 0 0 0 0 0 0 0 0 0 0 0
3.6161 0.4239 -0.1656 C 0 0 0 0 0 0 0 0 0 0 0 0
4.8585 1.1694 -0.1101 N 0 0 0 0 0 0 0 0 0 0 0 0
6.0486 0.5070 0.0284 C 0 0 0 0 0 0 0 0 0 0 0 0
6.0674 -0.9989 0.1233 C 0 0 0 0 0 0 0 0 0 0 0 0
6.7041 -1.4265 1.4400 C 0 0 0 0 0 0 0 0 0 0 0 0
7.9392 -2.2142 0.9971 C 0 0 0 0 0 0 0 0 0 0 0 0
7.5117 -2.6464 -0.3974 C 0 0 0 0 0 0 0 0 0 0 0 0
6.8390 -1.5330 -0.9214 O 0 0 0 0 0 0 0 0 0 0 0 0
7.0837 1.1281 0.0746 O 0 0 0 0 0 0 0 0 0 0 0 0
-8.0339 2.2113 -1.2306 C 0 0 0 0 0 0 0 0 0 0 0 0
-7.3951 -1.2265 -0.9651 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.7961 -1.9135 -0.5456 N 0 0 0 0 0 0 0 0 0 0 0 0
0.2677 2.9880 -0.5986 C 0 0 0 0 0 0 0 0 0 0 0 0
-5.2273 -1.3746 -0.7784 H 0 0 0 0 0 0 0 0 0 0 0 0
-4.2580 3.5003 -1.0009 H 0 0 0 0 0 0 0 0 0 0 0 0
1.0200 0.0249 0.5607 H 0 0 0 0 0 0 0 0 0 0 0 0
1.1680 -0.0879 -1.2478 H 0 0 0 0 0 0 0 0 0 0 0 0
2.5742 1.9736 -1.2610 H 0 0 0 0 0 0 0 0 0 0 0 0
2.4262 2.0863 0.5476 H 0 0 0 0 0 0 0 0 0 0 0 0
3.4915 -0.1589 0.7744 H 0 0 0 0 0 0 0 0 0 0 0 0
3.6394 -0.2717 -1.0341 H 0 0 0 0 0 0 0 0 0 0 0 0
4.8459 2.1793 -0.1737 H 0 0 0 0 0 0 0 0 0 0 0 0
5.0195 -1.3686 0.0605 H 0 0 0 0 0 0 0 0 0 0 0 0
6.0110 -2.0701 2.0266 H 0 0 0 0 0 0 0 0 0 0 0 0
6.9512 -0.5671 2.1027 H 0 0 0 0 0 0 0 0 0 0 0 0
8.8466 -1.5705 0.9665 H 0 0 0 0 0 0 0 0 0 0 0 0
8.2213 -3.0511 1.6744 H 0 0 0 0 0 0 0 0 0 0 0 0
6.8325 -3.5265 -0.3448 H 0 0 0 0 0 0 0 0 0 0 0 0
8.3759 -2.9532 -1.0282 H 0 0 0 0 0 0 0 0 0 0 0 0
-8.8466 2.9634 -1.3438 H 0 0 0 0 0 0 0 0 0 0 0 0
-8.0449 1.5199 -2.1027 H 0 0 0 0 0 0 0 0 0 0 0 0
-8.1932 1.6314 -0.2941 H 0 0 0 0 0 0 0 0 0 0 0 0
-8.4538 -1.5637 -1.0309 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.8126 -1.6689 -1.8040 H 0 0 0 0 0 0 0 0 0 0 0 0
-6.9602 -1.5572 0.0046 H 0 0 0 0 0 0 0 0 0 0 0 0
-3.7756 -2.2875 -0.6027 H 0 0 0 0 0 0 0 0 0 0 0 0
-1.9872 -2.5744 -0.4384 H 0 0 0 0 0 0 0 0 0 0 0 0
-0.6998 3.5265 -0.7112 H 0 0 0 0 0 0 0 0 0 0 0 0
0.7713 3.3225 0.3359 H 0 0 0 0 0 0 0 0 0 0 0 0
0.9199 3.2106 -1.4726 H 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0
1 6 1 0
1 11 1 0
3 2 1 0
2 12 1 0
4 3 2 0
3 29 1 0
5 4 1 0
7 4 1 0
6 5 2 0
5 10 1 0
6 30 1 0
8 7 2 0
7 27 1 0
9 8 1 0
10 9 2 0
9 13 1 0
11 25 1 0
12 26 1 0
13 14 1 0
13 28 1 0
14 15 1 0
14 31 1 0
14 32 1 0
15 16 1 0
15 33 1 0
15 34 1 0
16 17 1 0
16 35 1 0
16 36 1 0
17 18 1 0
17 37 1 0
18 19 1 0
18 24 2 0
19 20 1 0
23 19 1 0
19 38 1 0
20 21 1 0
20 39 1 0
20 40 1 0
21 22 1 0
21 41 1 0
21 42 1 0
23 22 1 0
22 43 1 0
22 44 1 0
25 45 1 0
25 46 1 0
25 47 1 0
26 48 1 0
26 49 1 0
26 50 1 0
27 51 1 0
27 52 1 0
28 53 1 0
28 54 1 0
28 55 1 0
M END
$$$$
# Input/output file names handed to preprocess_data.py below.
# The upper-case values look like placeholders - replace them with real paths.
fasta_filename=FASTA
neighbor_fasta_filename=NEIGHBPR_FASTA
# Shell variable names cannot contain '-'; the underscore form is the one
# referenced as ${neighbor_metadata_filename} in the commented command below.
neighbor_metadata_filename=METADATA
s1_filename=S1
s2_filename=S2
s_filename=S
# python preprocess_data.py \
# --fasta-filename ${fasta_filename} \
# --neighbor-fasta-filename ${neighbor_fasta_filename} \
# --neighbor-metadata-filename ${neighbor_metadata_filename} \
# --split True \
# --mask True \
# --s1-filename ${s1_filename} \
# --s2-filename ${s2_filename} \
# --s-filename ${s_filename}
# demo for using neighbor_file
# python preprocess_data.py \
# --fasta-filename S_t_neighbor.fa \
# --neighbor-fasta-filename S_t_1_neighbor.fa \
# --neighbor-metadata-filename t_1_neighbor.csv \
# --split True \
# --mask True \
# --s1-filename s1.csv \
# --s2-filename s2.csv \
# --s-filename s.csv
# Active command: mask residues and write s1/s2/s files without neighbor data.
python preprocess_data.py \
    --fasta-filename "${fasta_filename}" \
    --mask True \
    --wo-neighbor True \
    --s1-filename "${s1_filename}" \
    --s2-filename "${s2_filename}" \
    --s-filename "${s_filename}"
import random
import random
import pandas as pd
import numpy as np
def parse_args():
    """Parse the command-line options of the preprocessing script.

    Options that are not given on the command line fall back to the demo
    configuration (``S.t_neighbor.fa``, ``S.t_1_neighbor.fa``,
    ``t_1_neighbor.csv``, split on, wo-neighbor off, mask on and the
    ``*_results_demo.csv`` outputs), so running the script without arguments
    behaves as before.  Values that ARE given are now honoured instead of
    being overwritten after parsing.

    Boolean options take an explicit value (``--mask True`` / ``--mask False``).
    They are converted with a real text-to-bool parser because ``type=bool``
    turns every non-empty string, including ``"False"``, into ``True``.

    Returns:
        argparse.Namespace with ``fasta_filename``, ``neighbor_fasta_filename``,
        ``neighbor_metadata_filename``, ``split``, ``wo_neighbor``, ``mask``,
        ``s1_filename``, ``s2_filename`` and ``s_filename``.
    """
    import argparse

    def str_to_bool(value):
        """Convert a textual flag value such as 'True' or 'no' to a bool."""
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(value))

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--fasta-filename', type=str, default='S.t_neighbor.fa',
                        help='t time slice spike fasta file name')
    parser.add_argument('--neighbor-fasta-filename', type=str, default='S.t_1_neighbor.fa',
                        help='t-1 time slice spike fasta file name')
    parser.add_argument('--neighbor-metadata-filename', type=str, default='t_1_neighbor.csv',
                        help='the neighbor metadata filename, the column names are id and t_1, id is t time slice spike id and t_1 is t-1 time slice spike id')
    parser.add_argument('--split', type=str_to_bool, default=True,
                        help='set to True to split spike to s1 and s2 to introduce neighbor sequence')
    parser.add_argument('--wo-neighbor', type=str_to_bool, default=False,
                        help='set to True to split spike to s1 and s2 and not to introduce neighbor sequence')
    parser.add_argument('--mask', type=str_to_bool, default=True,
                        help='set to True to randomly continuously mask amino acids')
    parser.add_argument('--s1-filename', type=str, default='s1_results_demo.csv',
                        help='s1 spike preprocess results file name')
    parser.add_argument('--s2-filename', type=str, default='s2_results_demo.csv',
                        help='s2 spike preprocess results file name')
    parser.add_argument('--s-filename', type=str, default='s_results_demo.csv',
                        help='spike preprocess results file name')
    return parser.parse_args()
def read_fasta(fasta_name):
    """Load a FASTA file into a two-column DataFrame.

    Args:
        fasta_name: path of the FASTA file to read.

    Returns:
        pandas.DataFrame with the columns ``id`` (the header line without its
        leading ``>``) and ``sequence`` (all lines of the record joined
        together), one row per record in file order.  Records whose header is
        empty and any text before the first header are not stored.
    """
    ids = []
    sequences = []
    key = ''
    chunks = []
    with open(fasta_name, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no data; indexing line[0] on them would
                # raise IndexError.
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if key:
                    ids.append(key)
                    sequences.append(''.join(chunks))
                key = line[1:]
                chunks = []
            else:
                chunks.append(line)
    # Flush the final record, which has no following header to close it.
    if key:
        ids.append(key)
        sequences.append(''.join(chunks))
    return pd.DataFrame({'id': ids, 'sequence': sequences})
def get_mask_index(n, mask_num=5):
    """Randomly pick the positions of a sequence that will be masked.

    The selection is made of one run of three consecutive positions, then a
    random number of adjacent pairs, then single positions until ``mask_num``
    positions have been chosen.  With the default ``mask_num`` of 5 the pair
    count drawn by ``random.randrange(0, 1)`` is always 0, so the result is one
    triple plus two singles.

    Args:
        n: length of the sequence; positions are drawn from ``range(n)``.
        mask_num: number of positions to mask (default 5).  It is clamped to
            ``n`` so short sequences no longer hang or raise.

    Returns:
        list of distinct ints in selection order; when at least three
        positions are selected the first three are the consecutive run.
    """
    # A proportional size, int(n * 0.15), was left commented out upstream.
    mask_num = max(0, min(mask_num, n))
    remain = list(range(n))  # stays sorted: elements are only ever removed
    mask = []

    # One run of three consecutive positions (needs at least three slots).
    tri_mask_num = 1 if mask_num >= 3 else 0
    done = 0
    while done < tri_mask_num:
        x = random.choice(remain)
        if (x + 1 in remain) and (x + 2 in remain):
            for pos in (x, x + 1, x + 2):
                remain.remove(pos)
                mask.append(pos)
            done += 1

    # randrange(0, 0) raises ValueError, so only draw when a pair could fit.
    pair_limit = (mask_num - len(mask)) // 2
    double_mask_num = random.randrange(0, pair_limit) if pair_limit > 0 else 0
    done = 0
    while done < double_mask_num:
        # Stop instead of spinning forever when no adjacent pair is left.
        if not any(b - a == 1 for a, b in zip(remain, remain[1:])):
            break
        x = random.choice(remain)
        if x + 1 in remain:
            for pos in (x, x + 1):
                remain.remove(pos)
                mask.append(pos)
            done += 1

    # Fill up with single positions; mask_num <= n guarantees remain is
    # non-empty for as long as this loop runs.
    while len(mask) < mask_num:
        x = random.choice(remain)
        remain.remove(x)
        mask.append(x)
    return mask
# Entry point of the preprocessing script: reads the FASTA input, optionally
# joins neighbor (t-1) sequences, optionally masks positions, and writes the
# s1 / s2 / full-sequence CSV files named in the parsed arguments.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# statements below (in particular the drop_duplicates after the else branch and
# the final 'output' / s_filename lines) must be confirmed against the file.
def main():
args = parse_args()
print('load {} data...'.format(args.fasta_filename))
data_df = read_fasta(args.fasta_filename)
if args.split:
print('load neighbor(t-1) {} data && join to t data...'.format(args.neighbor_fasta_filename))
neighbor_fasta_df = read_fasta(args.neighbor_fasta_filename)
# Rename so the neighbor columns do not clash with the t-slice 'id'/'sequence'.
neighbor_fasta_df = neighbor_fasta_df.rename(columns = {'id':'nearest_neighbor_t-1', 'sequence':'neighbor_sequence'})
neighbor_df =pd.read_csv(args.neighbor_metadata_filename)
# Inner-join t sequences with the metadata on 'id', then attach the t-1
# sequence whose id equals the metadata column 't_1'.
step_one_df = pd.merge(data_df, neighbor_df, how='inner',on='id')
step_two_df = pd.merge(step_one_df, neighbor_fasta_df, how='inner',left_on='t_1' ,right_on='nearest_neighbor_t-1')
input_df = step_two_df.drop_duplicates(keep='first')
else:
input_df = data_df
# Keep a single row per distinct 'sequence' value.
input_df = input_df.drop_duplicates(subset=['sequence'])
if args.mask:
print('randomly continuous mask amino acids...')
sequence_list = input_df['sequence'].to_list()
input_df = input_df.drop(columns=['sequence'])
mask_sequence_list = []
# NOTE(review): n is assigned but not read anywhere else in this function.
n = len(sequence_list)
for s in sequence_list:
# Every position returned by get_mask_index is replaced with '-'.
mask = get_mask_index(len(s))
mask_s=[s[i] if i not in mask else '-' for i in range(len(s))]
mask_s=''.join(mask_s)
mask_sequence_list.append(mask_s)
input_df['sequence'] = mask_sequence_list
# The split point is hard-coded: s1 is the first 688 characters, s2 the rest.
def get_s1(s):
return s[:688]
def get_s2(s):
return s[688:]
input_df['s1'] = input_df['sequence'].apply(get_s1)
input_df['s2'] = input_df['sequence'].apply(get_s2)
if args.split:
input_df['neighbor_s1'] = input_df['neighbor_sequence'].apply(get_s1)
input_df['neighbor_s2'] = input_df['neighbor_sequence'].apply(get_s2)
# Model input is "<segment> <neighbor segment>" joined by one space; the
# 'output' column is written as a single space.
input_df['s1_input'] = input_df['s1'] + ' ' + input_df['neighbor_s1']
input_df['s1_output'] = ' '
input_df[['s1_input', 's1_output']].to_csv(args.s1_filename, index=0, header=['input','output'])
input_df['s2_input'] = input_df['s2'] + ' ' + input_df['neighbor_s2']
input_df['s2_output'] = ' '
input_df[['s2_input', 's2_output']].to_csv(args.s2_filename, index=0, header=['input','output'])
print('s1 spike w/ neighbor(t-1) output to {}.\n'.format(args.s1_filename), \
's2 spike w/ neighbor(t-1) output to {}.'.format(args.s2_filename))
# Writes to the same s1/s2 file names as the split branch above, this time
# with the segments alone as input.
if args.wo_neighbor:
input_df['s1_output'] = ' '
input_df[['s1', 's1_output']].to_csv(args.s1_filename, index=0, header=['input','output'])
input_df['s2_output'] = ' '
input_df[['s2', 's2_output']].to_csv(args.s2_filename, index=0, header=['input','output'])
print('s1 spike w/o neighbor(t-1) output to {}.\n'.format(args.s1_filename), \
's2 spike w/o neighbor(t-1) output to {}.'.format(args.s2_filename))
# Full (unsplit) sequence with a single-space 'output' column.
input_df['output'] = ' '
input_df[['sequence', 'output']].to_csv(args.s_filename, index=0, header=['input','output'])
print('s spike w/o neighbor(t-1) output to {}'.format(args.s_filename))
# Run main() only when the file is executed as a script.
if __name__ == "__main__":
main()
This source diff could not be displayed because it is too large. You can view the blob instead.
id,t_1,identity
id,t_1,identity
OEAV18564343,OEAV18606525,99.608
OEAV18565104,OEAV18565274,99.765
OEAV18565025,OEAV18369048,99.738
OEAV18606592,OEAV18504649,99.738
OEAV18606630,OEAV18369048,99.686
OEAV18631882,OEAV18367639,99.738
OEAV18631890,OEAV18367639,99.738
OEAV18632626,OEAV18367639,99.738
OEAV18633181,OEAV18369048,99.66
OEAV18633554,OEAV18367639,99.738
OEAV18564311,OEAV18606525,99.608
OEAV18564312,OEAV18606525,99.608
OEAV18564313,OEAV18505053,99.608
OEAV18564315,OEAV18369061,99.608
OEAV18564316,OEAV18606525,99.608
OEAV18564317,OEAV18606525,99.581
OEAV18564318,OEAV18606525,99.608
OEAV18564319,OEAV18606525,99.581
OEAV18564322,OEAV18606525,99.608
OEAV18564321,OEAV18606525,99.555
OEAV18564340,OEAV18565164,99.608
OEAV18565118,OEAV18565164,99.555
OEAV18565286,OEAV18565164,99.581
OEAV18565308,OEAV18565164,99.608
OEAV18584287,OEAV18285490,99.608
OEAV18582714,OEAV18565164,99.608
OEAV18606720,OEAV18565164,99.608
OEAV18565318,OEAV18565164,99.581
OEAV18565340,OEAV18565164,99.608
OEAV18630714,OEAV18565164,99.608
OEAV18730653,OEAV18565164,99.555
OEAV18730662,OEAV18565164,99.555
OEAV18730678,OEAV18565164,99.555
OEAV18731668,OEAV18565164,99.555
OEAV18731691,OEAV18565164,99.555
OEAV18731694,OEAV18565164,99.555
OEAV18730681,OEAV18565164,99.555
OEAV18730758,OEAV18565164,99.555
OEAV18730757,OEAV18565164,99.555
OEAV18730774,OEAV18565164,99.555
OEAV18630736,OEAV18505057,99.581
OEAV18630632,OEAV18505057,99.581
OEAV18630789,OEAV18505057,99.581
OEAV18630811,OEAV18505057,99.503
OEAV18630841,OEAV18505057,99.581
OEAV18630855,OEAV18505057,99.581
OEAV18633617,OEAV18505057,99.581
OEAV18631147,OEAV18505057,99.555
OEAV18633273,OEAV18505057,99.581
OEAV18633699,OEAV18505057,99.503
OEAV18993406,OEAV18369339,99.267
OEAV18991923,OEAV18369339,99.294
\ No newline at end of file
# Merge the s1 and s2 generation results into a single results file.
python postprocess_data.py \
    --s1-filename=s1_results_demo.csv \
    --s2-filename=s2_demo_results.csv \
    --s-results-filename=s_results_demo.csv
\ No newline at end of file
# Merge the s1 and s2 generation results into a single results file.
python postprocess_data.py \
    --s1-filename=s1_results_demo.csv \
    --s2-filename=s2_demo_results.csv \
    --s-results-filename=s_results_demo.csv
\ No newline at end of file
import pandas as pd
import pandas as pd
from pandas import DataFrame
def parse_args():
    """Read the post-processing options from the command line.

    Returns:
        argparse.Namespace with ``s1_filename``, ``s2_filename`` and
        ``s_results_filename``; each is a string, or None when the option was
        not supplied.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Post process the csc score and grammer of generated spike protein sequence, only for CoT2G-F method.')
    # All three options share the same type and default, so declare them as
    # (flag, help text) pairs and register them in one loop.
    options = (
        ('--s1-filename', 's1 generation results file name.'),
        ('--s2-filename', 's2 generation results file name.'),
        ('--s-results-filename', 's1 and s2 merged results file name.'),
    )
    for flag, description in options:
        cli.add_argument(flag, type=str, default=None, help=description)
    return cli.parse_args()
def main():
    """Merge the s1 and s2 generation results into one CSV file.

    Reads the two CSV files named by ``--s1-filename`` and ``--s2-filename``
    and combines them row by row: ``pred_sequences`` are concatenated,
    ``pred_mutations`` are joined with one space, and ``csc`` / ``grammer``
    are averaged.  The result is written to ``--s-results-filename`` without
    an index column.

    The function prints a message and returns without writing anything when a
    file name is missing or when the two inputs have different row counts.
    """
    args = parse_args()
    if args.s1_filename is None:
        print("Please indict s1 protain generation file name !!")
        return
    if args.s2_filename is None:
        print("Please indict s2 protain generation file name !!")
        return
    if args.s_results_filename is None:
        # to_csv(None) returns the CSV text instead of writing a file, so the
        # "saved into" message below would be wrong; stop early instead.
        print("Please indicate the merged results file name (--s-results-filename) !!")
        return

    s1_df = pd.read_csv(args.s1_filename)
    s2_df = pd.read_csv(args.s2_filename)
    if s1_df.shape[0] != s2_df.shape[0]:
        print("The count of generated s1 protain and generated s2 protain is suposed to be equal !!")
        return

    # Rows are paired by position: row i of s1 belongs with row i of s2.
    s_df = DataFrame({
        'pred_sequences': s1_df['pred_sequences'] + s2_df['pred_sequences'],
        'pred_mutations': s1_df['pred_mutations'] + ' ' + s2_df['pred_mutations'],
        'csc': (s1_df['csc'] + s2_df['csc']) / 2,
        'grammer': (s1_df['grammer'] + s2_df['grammer']) / 2,
    })
    s_df.to_csv(args.s_results_filename, index=0)
    print("The merge generated results are saved into {}.".format(args.s_results_filename))


if __name__ == "__main__":
    main()
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
3.854415274463007246e-01 0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 3.701467387488588390e-01 1.843457636052180382e-01
3.854415274463007246e-01 0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 3.701467387488588390e-01 1.843457636052180382e-01
2.684964200477326091e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 3.701467387488588390e-01 1.843457636052180382e-01
0.000000000000000000e+00 6.666666666666666297e-01 5.000000000000000000e-01 1.000000000000000000e+00 3.333333333333333703e-01 2.103840483044302490e-01 6.758466201080510771e-01
5.369928400954654402e-02 6.666666666666666297e-01 5.000000000000000000e-01 1.000000000000000000e+00 3.333333333333333703e-01 3.473285122516323042e-01 6.758466201080510771e-01
3.233890214797136564e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 8.310398090289965456e-01 1.843457636052180382e-01
3.818615751789976032e-01 3.333333333333333148e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 2.653935266446673658e-01 1.843457636052180382e-01
5.608591885441527314e-01 1.000000000000000000e+00 1.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 3.827845257319384409e-01 5.622611674792461489e-01
3.782816229116945927e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 7.620234501158462681e-01 1.843457636052181492e-01
8.138424821002385734e-01 1.000000000000000000e+00 1.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 4.400758267218984887e-01 5.272104361575966625e-01
3.830548926014320510e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 7.620234501158462681e-01 1.843457636052181492e-01
3.544152744630071905e-01 0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00 0.000000000000000000e+00 6.591659060591167352e-01 1.843457636052180382e-01
3.150357995226730767e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 0.000000000000000000e+00 7.521412570826193633e-01
4.212410501193317169e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 5.438812048023590195e-01 0.000000000000000000e+00
3.436754176610978262e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 1.369444639472022773e-01 7.521412570826193633e-01
1.000000000000000000e+00 1.000000000000000000e+00 1.000000000000000000e+00 1.000000000000000000e+00 9.999999999999998890e-01 1.361616232535282078e-01 1.000000000000000000e+00
3.472553699284009476e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 4.103770273116591483e-02 4.509157991830281542e-01
3.424821002386633784e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 1.774204872568978519e-01 4.509157991830281542e-01
3.806682577565632664e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 6.250789861686442128e-01 1.843457636052181492e-01
3.723150357995225757e-01 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 3.333333333333333703e-01 9.999999999999998890e-01 3.924100672025299108e-01
3.448687350835322740e-01 3.333333333333333148e-01 0.000000000000000000e+00 1.000000000000000000e+00 3.333333333333333703e-01 7.276907954784807009e-01 4.509157991830281542e-01
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment