ex_split_fasta

ex_split_fasta reads a FASTA file and writes each sequence it contains to a separate file

USAGE:
./ex_split_fasta 5edw.fasta

Categories:

  • core/data/io/fasta_io.hh

Input files:

Program source:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <core/data/io/fasta_io.hh>
#include <utils/string_utils.hh>
#include <utils/exit.hh>

/// Usage text of the program; main() hands it to utils::exit_OK_with_message() when no input file is given.
std::string program_info{R"(

ex_split_fasta reads a FASTA file and writes every sequence from it in a separate file
USAGE:
    ./ex_split_fasta 5edw.fasta

)"};

/** @brief Reads a file with sequences in FASTA format and writes each sequence to a separate FASTA file.
 *
 * CATEGORIES: core/data/io/fasta_io.hh;
 * KEYWORDS:   FASTA input; FASTA output; sequence; FASTA; pre-processing
 */
int main(const int argc, const char *argv[]) {


  if(argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using core::data::sequence::Sequence_SP; // --- Sequence_SP is just a std::shared_ptr to core::data::sequence::Sequence type
  using namespace core::data::io;          // --- for FASTA I/O

  // --- Create a container where the sequences will be stored
  std::vector<Sequence_SP> sequences;

  // --- Read a file with FASTA sequences
  core::data::io::read_fasta_file(argv[1], sequences);

  // --- Write them in separate FASTA files
  for (const Sequence_SP s : sequences) {
    std::string header = s->header();
    std::replace(header.begin(), header.end(), '|', ' '); // --- fix ncbi-style header in FASTA files
    auto words = utils::split(header, ' '); // --- We take the very first word of the FASTA as a file name; hopefully it is sth meaningful, e.g. a gene name
    std::ofstream out(words[0] + ".fasta");
    out << "> " << s->header() << "\n" << s->sequence << "\n";
    out.close();
  }
}
../_images/file_icon.png