ap_PairwiseSequenceIdentityProtocol

ap_PairwiseSequenceIdentityProtocol evaluates pairwise sequence identity between all sequences found in a given FASTA file. The calculated values are printed on the screen if they are greater than a given cutoff (third parameters, 0.25 in the example below). Calculations may be executed in several parallel threads, the number of threads is the second parameter of this program In addition, the calculations may be performed for a single sequence (against all the other sequences) or for a range of sequences. The query sequence or sequence range must be provided as fourth, or fourth and fifth parameters, respectively

USAGE:
ap_PairwiseSequenceIdentityProtocol input.fasta [n_threads [seqID_cutoff [first [last]] ] ]
EXAMPLES:
ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25
ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25 0 10

Categories:

  • core/protocols/PairwiseSequenceIdentityProtocol.hh

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#include <core/index.hh>
#include <utils/exit.hh>
#include <utils/Logger.hh>
#include <core/data/io/fasta_io.hh>
#include <core/protocols/PairwiseSequenceIdentityProtocol.hh>

std::string program_info = R"(

ap_PairwiseSequenceIdentityProtocol evaluates pairwise sequence identity between all sequences found in a given FASTA file.
The calculated values are printed on the screen if they are greater than a given cutoff (third parameters, 0.25 in the example below).
Calculations may be executed in several parallel threads, the number of threads is the second parameter of this program
In addition, the calculations may be performed for a single sequence (against all the other sequences) or for a range of sequences.
The query sequence or sequence range must be provided as fourth, or fourth and fifth parameters, respectively

USAGE:
    ap_PairwiseSequenceIdentityProtocol input.fasta [n_threads [seqID_cutoff [first [last]] ] ]
EXAMPLES:
    ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25
    ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25 0 10

)";

/** @brief Uses PairwiseSequenceIdentityProtocol protocol to calculate all pairwise sequence identity values for a set of sequences
 *
 * ap_PairwiseSequenceIdentityProtocol evaluates pairwise sequence identity between all sequences found in a given FASTA file.
 * The calculated values are printed on the screen if they are greater than a given cutoff (third parameters, 0.25 in the example below).
 * Calculations may be executed in several parallel threads, the number of threads is the second parameter of this program
 * In addition, the calculations may be performed for a single sequence (against all the other sequences) or for a range of sequences.
 * The query sequence or sequence range must be provided as fourth, or fourth and fifth parameters, respectively
 * 
 * USAGE:
 *     ap_PairwiseSequenceIdentityProtocol input.fasta [n_threads [seqID_cutoff [first [last]] ] ]
 * EXAMPLES:
 *     ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25
 *     ap_PairwiseSequenceIdentityProtocol small500_95identical.fasta 4 0.25 0 10
 * CATEGORIES: core/protocols/PairwiseSequenceIdentityProtocol.hh
 * KEYWORDS:   FASTA input; sequence alignment
 * GROUP:      Alignments
 */
int main(const int argc, const char* argv[]) {

  if(argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using namespace core::data::sequence;
  using namespace core::protocols;
  using namespace core::alignment;

  utils::Logger logs("ap_PairwiseSequenceIdentityProtocol");
  int my_argc = argc;
  bool if_use_fasta = false;
  if (strstr(argv[my_argc-1],"fasta")!=NULL) {
    if_use_fasta = true;
    --my_argc;
  }
  core::index2 n_threads = (my_argc > 2) ? atoi(argv[2]) : 4;
  float cutoff = (my_argc > 3) ? atof(argv[3]) : 0.25;

  logs << utils::LogLevel::INFO << "number of threads used : " << n_threads << "\n";
  logs << utils::LogLevel::INFO << "seq. similarity cutoff : " << cutoff << "\n";
  logs << utils::LogLevel::INFO << "s : " << cutoff << "\n";

  core::protocols::PairwiseSequenceIdentityProtocol protocol;
  protocol.printed_seqname_length(5).gap_open(-10).gap_extend(-1).substitution_matrix("BLOSUM62").
    keep_alignments(true).alignment_method(AlignmentType::SEMIGLOBAL_ALIGNMENT).n_threads(n_threads);
  protocol.if_use_fasta_filter(if_use_fasta).seq_identity_cutoff(cutoff).batch_size(10000);

  if (my_argc == 5) {
    protocol.select_query(atoi(argv[4]));
    logs << utils::LogLevel::INFO << "Using sequence at index " << atoi(argv[4]) << " as a query\n";
  }
  if (my_argc == 6) {
    for (core::index4 i = atoi(argv[4]); i <= atoi(argv[5]); ++i) protocol.add_query(i);
    logs << utils::LogLevel::INFO << "Using " << atoi(argv[5]) - atoi(argv[4]) << " query sequences\n";
  }
  
  std::vector<Sequence_SP> input_sequences;
  core::data::io::read_fasta_file(argv[1], input_sequences);
  for(Sequence_SP si: input_sequences) protocol.add_input_sequence(si);

  auto start = std::chrono::high_resolution_clock::now(); // --- timer starts!
  protocol.run();
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
  logs << utils::LogLevel::INFO << (size_t ) protocol.n_jobs_completed()
            << " global alignment sequence identities calculated within " << time_span.count() << " [s]\n";

  protocol.print_header(std::cout);
  protocol.print_sequence_identity(std::cout);
}
../_images/file_icon.png