ap_filter_fasta

ap_find_in_fasta reads a file in FASTA format and prints only these sequences which satisfy the following filters: - sequence must a protein - sequence must not be shorter than 15 aa - sequence must contain at most 10 UNK residues The output sequences are sorted.

USAGE:
ap_filter_fasta input.fasta [input2.fasta ...]

Keywords:

Categories:

  • core/data/io/fasta_io

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#include <iostream>

#include <core/algorithms/UnionFind.hh>
#include <core/data/io/fasta_io.hh>
#include <utils/exit.hh>
#include <utils/io_utils.hh>

std::string program_info = R"(

ap_find_in_fasta reads a file in FASTA format and prints only these sequences which satisfy the following filters:
  - sequence must a protein
  - sequence must not be shorter than 15 aa
  - sequence must contain at most 10 UNK residues
The output sequences are sorted.
USAGE:
    ap_filter_fasta input.fasta [input2.fasta ...]

)";

/** @brief This program reads a file with sequences in FASTA format and sorts them by length.
 * DNA sequences, sequences that are shorter than 15 residues and those having more than 10 Xs are removed
 *
 * CATEGORIES: core/data/io/fasta_io;
 * KEYWORDS:   FASTA input; FASTA output; sequence; FASTA pre-processing; sequence filters
 */
int main(const int argc, const char* argv[]) {

  if (argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  // --- Sequence_SP is just a std::shared_ptr to core::data::sequence::Sequence type
  using core::data::sequence::Sequence_SP;

  // --- Create a container where the sequences will be stored
  std::vector<Sequence_SP> sequences;

  // --- Read a file (or files) with FASTA sequences; sequences are appended to the given vector
  for (int i = 1; i < argc; ++i) core::data::io::read_fasta_file(argv[i], sequences);

  // --- Remove sequences that do not come from proteins
  sequences.erase(std::remove_if(sequences.begin(),sequences.end(),
      [](const Sequence_SP s){ return !s->is_protein_sequence;}),sequences.end());

  // --- Remove sequences that are too short
  sequences.erase(std::remove_if(sequences.begin(),sequences.end(),
      [](const Sequence_SP s){ return s->length()<20;}),sequences.end());

  // --- Remove sequences that contain 10 or more 'X' characters (i.e. unknown amino acids)
  sequences.erase(std::remove_if(sequences.begin(),sequences.end(),
      [](const Sequence_SP s){ return std::count(s->sequence.begin(), s->sequence.end(), 'X')>10;}),sequences.end());

  // --- Now, sort the sequences by length
  std::sort(sequences.begin(),sequences.end(),
      [](const Sequence_SP si,const Sequence_SP sj){ return si->length()<sj->length();});

  // --- Remove duplicates
  core::algorithms::UnionFind<Sequence_SP, core::index4> uf;
  uf.add_element(sequences[0]);
  for (size_t i = 1; i < sequences.size(); ++i) {
    uf.add_element(sequences[i]);
    for (int j = i - 1; j >= 0; --j) {
      if (sequences[i]->length() - sequences[j]->length() > 20) break;
      if (sequences[i]->sequence.find(sequences[j]->sequence) != std::string::npos) uf.union_set(i, j);
    }
  }
  for (size_t i = 0; i < sequences.size(); ++i) {
    core::index4 set_id = uf.find_set(i);
    if (set_id != i) {
      std::string new_header = sequences[set_id]->header() + " " + sequences[i]->header();
      sequences[set_id]->header(new_header);
    }
  }

  // --- Print sequences to stdout
  for (size_t i = 0; i < sequences.size(); ++i) {
    if (uf.find_set(i) == i) std::cout << core::data::io::create_fasta_string(*sequences[i]) << "\n";
  }
}
../_images/file_icon.png