ap_AAHydrophobicity

Reads a PDB file and substitutes b-factor column with hydrophobicity values according to Kyte-Doolittle scale.

If just a PDB file is given as an input, all b-factors will be replaced by respective KD hydrophobicity values. User can also provide a Multiple Sequence Alignment (MSA) in ClustalO format (.aln); hydrophobicity values will be averaged over a corresponding column of the MSA

USAGE:
ap_AAHydrophobicity 2gb1.pdb
ap_AAHydrophobicity 2gb1.pdb 2gb1.aln 2GB1

The sequence from the given PDB file must also be included in the alignment; its name is third argument of the program.

Categories:

  • core/chemical/AAHydrophobicity

Output files:

Program source:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include <iostream>
#include <iomanip>

#include <core/algorithms/predicates.hh>
#include <core/alignment/NWAligner.hh>
#include <core/alignment/scoring/SimilarityMatrix.hh>
#include <core/alignment/scoring/SimilarityMatrixScore.hh>
#include <core/alignment/scoring/NcbiSimilarityMatrixFactory.hh>
#include <core/chemical/AAHydrophobicity.hh>
#include <core/data/io/Pdb.hh>
#include <core/data/io/clustalw_io.hh>

#include <utils/exit.hh>
#include <core/data/structural/structure_selectors.hh>

std::string program_info = R"(

Reads a PDB file and substitutes b-factor column with hydrophobicity values according to Kyte-Doolittle scale.

If just a PDB file is given as an input, all b-factors will be replaced by respective KD hydrophobicity values.
User can also provide a Multiple Sequence Alignment (MSA) in ClustalO format (.aln); hydrophobicity values will be
averaged over a corresponding column of the MSA

USAGE:
    ap_AAHydrophobicity 2gb1.pdb
    ap_AAHydrophobicity 2gb1.pdb 2gb1.aln 2GB1

The sequence from the given PDB file must also be included in the alignment; its name is third argument of the program.

)";

/** @brief Reads a PDB file and substitutes b-factor column with hydrophobicity values according to Kyte-Doolittle scale. This example prints atoms for each side chain in a protein
 * 
 * CATEGORIES: core/chemical/AAHydrophobicity;
 * KEYWORDS:   PDB input; hydrophobicity; structure selectors; PDB line filter; sequence alignment; MSA input
 */
int main(const int argc, const char *argv[]) {

  if (argc < 2) utils::exit_OK_with_message(program_info); // --- complain about missing program parameter

  using namespace core::data::io; // is_not_alternative, Pdb and read_clustalw_file() are from this namespace
  using namespace core::data::sequence;
  using namespace core::data::structural; // Structure and Residue come from here

  using namespace core::alignment::scoring;

  Pdb reader(argv[1], all_true(is_not_alternative, is_not_water));
  Structure_SP strctr = reader.create_structure(0); // create a Structure object from the first model found in the input file
  Chain & first_chain = *(*strctr)[0]; // --- We assume the first chain is the one used in MSA
  first_chain.erase(std::remove_if(first_chain.begin(), first_chain.end(), core::algorithms::Not<IsAA>(IsAA())), first_chain.end());

  std::vector<core::real> kd_values;
  const core::chemical::AAHydrophobicity &kd_scale = core::chemical::AAHydrophobicity::KyteDoolittle;

  std::ofstream out("out.pdb");
  // ---------- The case when we have both a PDB file and a multiple sewuence alignment (.aln file)
  if (argc ==4) {
    std::vector<Sequence_SP> msa;   // --- placeholder for aligned sequences
    core::data::io::read_clustalw_file(argv[2],msa); // --- read the MSA and store sequences in a vector

    // ---------- Find the reference sequence in the alignment
    std::string ref_sequence_name(argv[3]); // --- the name of the sequence
    auto s = std::find_if(msa.begin(), msa.end(),
      [&ref_sequence_name](Sequence_SP s) { return s->header().find(ref_sequence_name) != std::string::npos; });
    if (s == msa.end())
      utils::exit_OK_with_message(
        "Can't find the reference sequence in the given MSA. Is the name correct: " + ref_sequence_name);
    Sequence_SP ref_sequence = *s;

    // --- Create a sequence object for the first chain of the PDB deposit
    core::data::sequence::SecondaryStructure_SP pdb_seq = first_chain.create_sequence();

    // ---------- we have to align the reference sequence with the sequence found in the given PDB file
    // ---------- as they might differ; we set PDB sequence to be a query and the reference - as a template
    unsigned max_len = std::max(pdb_seq->length(), ref_sequence->length());
    core::alignment::NWAligner<short, SimilarityMatrixScore<short>> aligner(max_len);
    NcbiSimilarityMatrix_SP sim_m = NcbiSimilarityMatrixFactory::get().get_matrix("BLOSUM62");
    SimilarityMatrixScore<short> score(pdb_seq->sequence, ref_sequence->sequence, *sim_m);
    aligner.align(-10, -1, score);
    auto alignment = aligner.backtrace();

    std::cout << "#msa_col aa_col aa res_id : avg_KD n_aa\n";
    // ---------- Iterate over all columns of the MSA
    for (core::index2 i_res = 0; i_res < ref_sequence->length(); ++i_res) {
      if (ref_sequence->get_monomer(i_res).is_gap()) continue;
      int j = alignment->which_query_for_template(i_res); // --- -1 denotes a gap, otherwise the index is non-negative
      if (j < 0) continue;
      core::real avg_kd = 0;
      core::real n = 0;
      for (Sequence_SP si:msa) {
        if (!si->get_monomer(i_res).is_gap()) {
          avg_kd += kd_scale.hydrophobicity(si->get_monomer(i_res));
          ++n;
        }
      }
      std::cout
        << utils::string_format("%4d %4d %c %4d : %5.2f %3d\n", i_res, j, first_chain[j]->residue_type().code1,
          first_chain[j]->id(), avg_kd / n, int(n));
      avg_kd = avg_kd / n + 5.0; // --- we add 5.0 because KD scale is from -4.5 to 4.5 and b-factor can't be negative
      for (const PdbAtom_SP &a : *(first_chain[j])) {
        a->b_factor(avg_kd);
        out << a->to_pdb_line() << "\n";
      }
    }

    return 0;
  }

  // ---------- The case when we have only PDB file : iIterate over all residues in the structure
  for (auto res_it = strctr->first_residue(); res_it != strctr->last_residue(); ++res_it) {
    core::real val = kd_scale.hydrophobicity((*res_it)->residue_type()) + 5.0;

    for (const PdbAtom_SP &a : **res_it) {
      a->b_factor(val);
      out << a->to_pdb_line() << "\n";
    }
  }
  out.close();
}
../_images/file_icon.png