ap_MSAColumnConservationΒΆ

Reads a Multiple Sequence Alignment (MSA) in ClustalW or FASTA format and evaluates sequence conservation for every column.

USAGE:

./ap_MSAColumnConservation msa-file [sequence-id]

EXAMPLE:

./ap_MSAColumnConservation cyped.CYP109.aln M5R670_9BACI

where cyped.CYP109.aln is the name of input MSA file (.aln or .fasta format). If the sequence identifier is given as a second optional argument (here: M5R670_9BACI), program will attempt to find the sequence annotated with this name. When such a sequence is found, additional column will be added to provide residue for every position in that sequence (gaps are also shown).

Keywords:

Categories:

  • core::alignment::MSAColumnConservation

Input files:

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#include <iostream>

#include <core/alignment/MSAColumnConservation.hh>
#include <core/data/io/clustalw_io.hh>
#include <utils/exit.hh>
#include <utils/io_utils.hh>
#include <core/data/io/fasta_io.hh>

std::string program_info = R"(

Reads a Multiple Sequence Alignment (MSA) in ClustalW or FASTA format and evaluates sequence conservation for every column.

USAGE:
./ap_MSAColumnConservation msa-file [sequence-id]

EXAMPLE:
./ap_MSAColumnConservation cyped.CYP109.aln M5R670_9BACI

where cyped.CYP109.aln is the name of input MSA file (.aln or .fasta format). If the sequence identifier
is given as a second optional argument (here: M5R670_9BACI), program will attempt to find the sequence
annotated with this name. When such a sequence is found, additional column will be added to provide residue
for every position in that sequence (gaps are also shown).

)";

/** @brief Reads a MSA in ClustalW format and evaluates sequence conservation for every column
 *
 * CATEGORIES: core::alignment::MSAColumnConservation
 * KEYWORDS: clustal input; MSA; FASTA input
 * GROUP:      Alignments
 */
int main(const int argc, const char* argv[]) {

  if(argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using namespace core::data::io;
  using namespace core::data::sequence;

  std::vector<Sequence_SP> msa;   // --- Sequence_SP is just a shorter name for std::shared_ptr<Sequence>
  const std::pair<std::string, std::string> name_ext = utils::root_extension(argv[1]);
  if((name_ext.second=="fasta")||(name_ext.second=="FASTA")||(name_ext.second=="fast"))
    core::data::io::read_fasta_file(argv[1], msa, true);
  else
    core::data::io::read_clustalw_file(argv[1],msa);

  std::string seq_str( msa[0]->length(),' ');
  std::string seq_name = (argc > 2) ? argv[2] : "";
  bool sequence_found = false;
  if (seq_name.size() > 0) {
    for (const auto &seq:msa)
      if (seq->header().find(argv[2]) != std::string::npos) {
        seq_str = seq->sequence;
        sequence_found = true;
      }
    if (!sequence_found) std::cerr << "Warning: the sequence >" << seq_name << "< can't be located!\n";
  }

  core::alignment::MSAColumnConservation consrv(msa);
  if (sequence_found)
    std::cout << "#pos  a  gaps   Shanon Relative Variation SumOfPairs JensenShannon\n";
  else
    std::cout << "#pos   gaps   Shanon Relative Variation SumOfPairs JensenShannon\n";

  for (size_t ipos = 0; ipos < msa[0]->length(); ++ipos)
    std::cout << utils::string_format("%4d %c %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", ipos, seq_str[ipos],
      consrv.evaluate(core::alignment::ColumnConservationScores::GapPercent, ipos),
      consrv.evaluate(core::alignment::ColumnConservationScores::ShannonEntropy, ipos),
      consrv.evaluate(core::alignment::ColumnConservationScores::RelativeEntropy, ipos),
      consrv.evaluate(core::alignment::ColumnConservationScores::Variation, ipos),
      consrv.evaluate(core::alignment::ColumnConservationScores::SumOfPairs, ipos),
      consrv.evaluate(core::alignment::ColumnConservationScores::JensenShannonDivergence, ipos));
}
../_images/file_icon.png