seq_identity.py

Reads a .fasta file with a set of amino acid sequences and calculates each-vs-each pairwise alignments

using semi-global aligner. Prints only these pairs for which sequence identity is higher than a given cutoff

USAGE:
    python seq_identity.py test_inputs/cyped.CYP109.fasta 0.3

Keywords:

Categories:

  • core/protocols/PairwiseSequenceIdentityProtocol

Input files:

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import sys
sys.path.append('../../../../../bin/')
from pybioshell.std import vector_std_shared_ptr_core_data_sequence_Sequence_t
from pybioshell.core.data.io import read_fasta_file
from pybioshell.core.alignment import AlignmentType
from pybioshell.core.protocols import PairwiseSequenceIdentityProtocol


if len(sys.argv) < 3 :
  print("""

    Reads a .fasta file with a set of amino acid sequences and calculates each-vs-each pairwise alignments
using semi-global aligner. Prints only these pairs for which sequence identity is higher than a given cutoff
USAGE:
    python seq_identity.py test_inputs/cyped.CYP109.fasta 0.3

    CATEGORIES: core/protocols/PairwiseSequenceIdentityProtocol
    KEYWORDS:   FASTA input; pairwise sequence alignment;sequence alignment
    GROUP:      Alignments
    IMG:     ap_aligned-1k6m-1bif.png
  """)
  sys.exit()

cutoff = float(sys.argv[2])
sequences = vector_std_shared_ptr_core_data_sequence_Sequence_t()
read_fasta_file(sys.argv[1], sequences, True)
align_protocol = PairwiseSequenceIdentityProtocol()
n_seq = align_protocol.add_input_sequences(sequences)
align_protocol.n_threads(4).alignment_method(AlignmentType.SEMIGLOBAL_ALIGNMENT)
align_protocol.run()

for i in range(1,n_seq) :
  for j in range(i) :
    seq_id = align_protocol.get_sequence_identity(i,j)
    if seq_id > cutoff : print( i, j, seq_id)
../_images/ap_aligned-1k6m-1bif.png