ap_shuffled_sequence_alignmentΒΆ

Reads a FASTA file with two sequences and calculate global sequence alignment scores with one of the two sequences randomly shuffled N_shuffles times (1000 by default). Each time the reshuffled sequence is aligned to the other one. The statistics of scores from randomised alignments is then used to estimate p-value of the global alignment. The default substitution-matrix is BLOSUM62 The program prints all the randomized alignment scores and estimated p-value of the alignment

USAGE:

ap_shuffled_sequence_alignment input.fasta  [[substitution_matrix] N_shuffles]

EXAMPLE:

ap_shuffled_sequence_alignment input2.fasta  BLOSUM80 10000

Keywords:

Categories:

  • core::alignment::NWAligner

Input files:

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include <iostream>
#include <chrono>

#include <core/data/io/fasta_io.hh>

#include <core/alignment/NWAligner.hh>
#include <core/alignment/scoring/SimilarityMatrix.hh>
#include <core/alignment/scoring/SimilarityMatrixScore.hh>
#include <core/alignment/scoring/NcbiSimilarityMatrixFactory.hh>
#include <core/calc/statistics/OnlineStatistics.hh>
#include <core/calc/statistics/NormalDistribution.hh>
#include <core/calc/statistics/Random.hh>
#include <core/protocols/PairwiseSequenceIdentityProtocol.hh>
#include <utils/exit.hh>

std::string program_info = R"(

Reads a FASTA file with two sequences and calculate global sequence alignment scores
with one of the two sequences randomly shuffled N_shuffles times (1000 by default).
Each time the reshuffled sequence is aligned to the other one. The statistics of scores
from randomised alignments is then used to estimate p-value of the global alignment.
The default substitution-matrix is BLOSUM62

The program prints all the randomized alignment scores and estimated p-value of the alignment

USAGE:
    ap_shuffled_sequence_alignment input.fasta  [[substitution_matrix] N_shuffles]

EXAMPLE:
    ap_shuffled_sequence_alignment input2.fasta  BLOSUM80 10000

)";

/** @brief Calculate global sequence alignment scores with one sequence randomly shuffled and estimates alignment p-value
 *
 * CATEGORIES: core::alignment::NWAligner
 * KEYWORDS:   FASTA input; Needleman-Wunsch; sequence alignment; statistics
 * GROUP:      Alignments
 * IMG: ap_shuffled_sequence_alignment.png
 * IMG_ALT: Statistics of random sequence alignment between 1BC6 and SFL95851.1
 */
int main(const int argc, const char *argv[]) {

  if (argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using namespace core::data::io;
  using namespace core::alignment::scoring;

  core::index2 n_shuffles = (argc > 3) ? atoi(argv[3]) : 1000; // --- The number of random shuffles

  // --- Read the query sequence
  std::vector<std::shared_ptr<Sequence>> input_sequences;
  read_fasta_file(argv[1], input_sequences);

  // --- find longest sequence to initialize aligner object large enough
  unsigned max_len = std::max(input_sequences[0]->length(), input_sequences[1]->length());

  // --- create aligner object
  core::alignment::NWAligner<short, SimilarityMatrixScore<short>> aligner(max_len);

  // --- read similarity matrix from a file (i.e. BLOSUM62)
  std::string substitution_matrix_name = (argc > 2) ? argv[2] : "BLOSUM62";
  NcbiSimilarityMatrix_SP sim_m = NcbiSimilarityMatrixFactory::get().get_matrix("BLOSUM62");

  // --- go through all db sequences and align them with the given query
  auto start = std::chrono::high_resolution_clock::now(); // --- timer starts!

  std::string j_seq_copy = input_sequences[1]->sequence;
  SimilarityMatrixScore<short> score(input_sequences[0]->sequence, j_seq_copy, *sim_m);
  // --- find score of the alignment; just the score - this is faster than aligning and keeping backtracking info
  short result = aligner.align_for_score(-10, -1, score);

  core::calc::statistics::Random & r = core::calc::statistics::Random::get();
  r.seed(12345);  // --- seed the generator for repeatable results
  core::calc::statistics::OnlineStatistics stats; // --- online (on-the fly) statistics calculator
  for (size_t i = 0; i < n_shuffles; ++i) {
    shuffle(j_seq_copy.begin(), j_seq_copy.end(), r);
    SimilarityMatrixScore<short> score(input_sequences[0]->sequence, j_seq_copy, *sim_m);
    short res = aligner.align_for_score(-10, -1, score);
    stats(res);
    std::cout << res << "\n";
  }
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
  std::cerr << "# " << n_shuffles << " alignment shuffled scores computed within "
            << time_span.count() << " [s]\n";
  std::cout << "# alignment score: " << result << "\n";
  std::cout << "# normal p-value, avg, sdev: "
            << 1 - core::calc::statistics::NormalDistribution::cdf(result, stats.avg(), sqrt(stats.var())) << " "
            << stats.avg()
            << " " << sqrt(stats.var()) << "\n";
  core::protocols::PairwiseSequenceIdentityProtocol protocol;
  protocol.substitution_matrix("BLOSUM62").gap_open(-10).gap_extend(-1);
  protocol.add_input_sequence(input_sequences[0]);
  protocol.add_input_sequence(input_sequences[1]);
  protocol.run();
  std::cout << "# same value calculated by a library function: " << protocol.count_identical(0, 1) << "\n";
}
../_images/ap_shuffled_sequence_alignment.png