ap_SWAligner

Calculate all pairwise local sequence alignments (Smith&Waterman algorithm) between sequences read from a FASTA file. For every pair of sequences it prints three columns: query length, template length and the alignment score.

USAGE:
ap_SWAligner test_inputs/ferrodoxins.fasta test_inputs/ferrodoxins.fasta

Keywords:

Categories:

  • core/alignment/SWAligner

Input files:

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <iostream>
#include <chrono>
#include <algorithm>

#include <core/data/io/fasta_io.hh>

#include <core/data/sequence/Sequence.hh>
#include <core/alignment/SWAligner.hh>
#include <core/alignment/scoring/SimilarityMatrix.hh>
#include <core/alignment/scoring/SimilarityMatrixScore.hh>
#include <core/alignment/scoring/NcbiSimilarityMatrixFactory.hh>
#include <core/alignment/PairwiseSequenceAlignment.hh>
#include <core/data/io/alignment_io.hh>

#include <utils/exit.hh>

std::string program_info = R"(

Calculate all pairwise local sequence alignments (Smith&Waterman algorithm) between sequences read from a FASTA file.
For every pair of sequences it prints three columns: query length, template length and the alignment score.

USAGE:
    ap_SWAligner test_inputs/ferrodoxins.fasta test_inputs/ferrodoxins.fasta

)";

/** @brief Calculate all pairwise sequence alignments between sequences read from two FASTA files : query and database
 *
 * CATEGORIES: core/alignment/SWAligner
 * KEYWORDS:   FASTA input; SWAligner; sequence alignment
 * GROUP:      Alignments
 */
int main(const int argc, const char* argv[]) {

  if(argc < 3) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using namespace core::data::io;
  using namespace core::data::sequence;
  using namespace core::alignment::scoring;

  // --- the query sequence
  std::vector<std::shared_ptr<Sequence>> query_sequences;
  read_fasta_file(argv[1], query_sequences);
  // --- container for the sequence database
  std::vector<std::shared_ptr<Sequence>> db_sequences;
  read_fasta_file(argv[2], db_sequences);

  // --- find longest sequence to initialize aligner object large enough
  unsigned max_len = 0;
  std::for_each(query_sequences.begin(), query_sequences.end(),
    [&max_len](const Sequence_SP s) { max_len = std::max(max_len, unsigned(s->length())); });
  std::for_each(db_sequences.begin(), db_sequences.end(),
    [&max_len](const Sequence_SP s) { max_len = std::max(max_len, unsigned(s->length())); });

  // ---------- Create aligner object
  core::alignment::SWAligner<short, SimilarityMatrixScore<short>> aligner(max_len);

  // ---------- read similarity matrix from a file (e.g. BLOSUM62)
  NcbiSimilarityMatrix_SP sim_m = NcbiSimilarityMatrixFactory::get().get_matrix((argc > 3) ? argv[3] : "BLOSUM62");

  // ---------- Go through all db sequences and align them with the given query
  auto start = std::chrono::high_resolution_clock::now(); // --- timer starts!
  for (size_t i = 0; i < query_sequences.size(); ++i) {
    for (size_t j = 0; j < db_sequences.size(); ++j) {

      // ---------- Here we create a sequence similarity object that will score a match
      // ---------- between individual positions from the two sequences being aligned
      SimilarityMatrixScore<short> score(query_sequences[i]->sequence, db_sequences[j]->sequence, *sim_m);

      // ---------- calculate local alignment
      aligner.align(-10, -1, score);

      // ---------- Convert the abstract alignment to a pairwise sequence alignment object
      const core::alignment::PairwiseAlignment_SP  ali = aligner.backtrace();
      core::alignment::PairwiseSequenceAlignment seq_ali(ali, query_sequences[i], db_sequences[j]);

      // ---------- Print the alignment in Edinburgh format
      core::data::io::write_edinburgh(seq_ali, std::cout, 80);
    }
  }
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
  std::cerr << db_sequences.size() * query_sequences.size() << " local alignment scores computed within "
            << time_span.count() << " [s]\n";
}
../_images/file_icon.png