ap_find_in_fastaΒΆ

Program reads a sequence database in FASTA format and a text file with sequence identifiers, and prints the requested sequences on the screen.

USAGE:

ap_find_in_fasta input.fasta seq_id_list.txt

EXAMPLE:

ap_find_in_fasta uniref90.fasta seq_id_list.txt
ap_find_in_fasta ferrodoxins.fasta selected_list.txt

Keywords:

Categories:

  • core/data/io/fasta_io.hh

Input files:

Output files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <iostream>

#include <core/data/io/fasta_io.hh>
#include <utils/exit.hh>
#include <utils/io_utils.hh>

std::string program_info = R"(

Program reads a sequence database in FASTA format and a text file with sequence identifiers, and prints
the requested sequences on the screen.

USAGE:
    ap_find_in_fasta input.fasta seq_id_list.txt
EXAMPLE:
    ap_find_in_fasta uniref90.fasta seq_id_list.txt
    ap_find_in_fasta ferrodoxins.fasta selected_list.txt

)";

bool is_good_sequence(const core::data::sequence::Sequence_SP seq, const std::vector<std::string>  & wanted_seq_id) {

  for(const std::string & s : wanted_seq_id) if(seq->header().find(s)!=std::string::npos) return true;

  return false;
}


/** @brief ap_find_in_fasta reads a sequence database in FASTA format and looks for sequences by given IDs
 *
 * CATEGORIES: core/data/io/fasta_io.hh;
 * KEYWORDS:   FASTA input; FASTA output; sequence; FASTA; pre-processing
 * GROUP:      File processing;Data filtering
 */
int main(const int argc, const char *argv[]) {

  if (argc < 3) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using core::data::sequence::Sequence_SP; // --- Sequence_SP is just a std::shared_ptr to core::data::sequence::Sequence type
  using namespace core::data::io;          // --- for FASTA I/O

  std::vector<std::string> wanted_seq_id;
  utils::read_listfile(argv[2], wanted_seq_id);
  std::vector<core::data::sequence::Sequence_SP> sink;

  // --- Read a file with FASTA sequences
  core::data::sequence::Sequence_SP seq = nullptr;
  std::ifstream infile;
  utils::in_stream(argv[1], infile);
  size_t n = 0;
  infile >> seq;
  while (seq != nullptr) {
    if (is_good_sequence(seq, wanted_seq_id)) std::cout << create_fasta_string(*seq) << '\n';
    if ((++n) % 10000 == 10000)  std::cerr << n << " sequences tested\n";
    infile >> seq;
  }
}
../_images/file_icon.png