Antonie
refgenome.hh
Go to the documentation of this file.
00001 #pragma once
#include <forward_list>
#include <map>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "antonie.hh"
#include "fastq.hh"
00009 
00010 using std::string;
00011 using std::vector;
00012 using std::unordered_map;
00013 using std::map;
00014 using std::forward_list; 
00015 
00017 struct FASTQMapping
00018 {
00019   uint64_t pos;
00020   bool reverse;
00021   int indel; // 0 = nothing, >0 means WE have an insert versus reference at pos
00022              // <0 means WE have a delete versus reference at pos
00023 };
00024 
00026 struct GenomeLocusMapping
00027 {
00028   GenomeLocusMapping() : coverage(0) {}
00029   forward_list<FASTQMapping> d_fastqs;
00030   unsigned int coverage;
00031 };
00032 
00033 
00035 struct Unmatched
00036 {
00037   string left, unmatched, right;
00038   dnapos_t pos;
00039 };
00040 
00042 class ReferenceGenome
00043 {
00044 public:
00045   ReferenceGenome(const string& fname); 
00046   dnapos_t size() const {
00047     return d_genome.size() - 1; // we pad at the beginning so we are 1 based..
00048   }
00049   vector<uint32_t> getMatchingHashes(const vector<uint32_t>& hashes);
00050 
00052   struct MatchDescriptor
00053   {
00054     dnapos_t pos;
00055     bool reverse;
00056     int score;
00057   };
00058   void mapFastQ(dnapos_t pos, const FastQRead& fqfrag, int indel=0);
00059   void cover(dnapos_t pos, char quality, int limit);
00060   void cover(dnapos_t pos, unsigned int length, const std::string& quality, int limit) ;
00061   vector<MatchDescriptor> getAllReadPosBoth(FastQRead* fq); // tries original & complement
00062   dnapos_t getReadPosBoth(FastQRead* fq, int qlimit); // tries original & complement
00063   vector<dnapos_t> getReadPositions(const std::string& nucleotides);
00064 
00065   vector<dnapos_t> getGCHisto();
00066   string snippet(dnapos_t start, dnapos_t stop) const;
00067 
00068   void printCoverage(FILE* jsfp, const std::string& fname);
00069   void index(unsigned int length);
00070 
00071   string getMatchingFastQs(dnapos_t pos, StereoFASTQReader& fastq); 
00072   string getMatchingFastQs(dnapos_t start, dnapos_t stop,  StereoFASTQReader& fastq); 
00073   vector<GenomeLocusMapping> d_mapping;
00074   vector<unsigned int> d_correctMappings, d_wrongMappings, d_gcMappings, d_taMappings;
00075   vector<vector<uint32_t>> d_kmerMappings;
00076   vector<Unmatched> d_unmRegions;
00078   struct LociStats
00079   {
00081     struct Difference
00082     {
00083       char nucleotide;
00084       char quality;
00085       bool headOrTail;
00086       bool operator<(const Difference& b) const
00087       {
00088         return std::tie(nucleotide, quality) < std::tie(b.nucleotide, b.quality);
00089       }
00090     };
00091     vector<Difference> samples; 
00092   };
00093   dnapos_t d_aCount, d_cCount, d_gCount, d_tCount;
00094   typedef unordered_map<dnapos_t, LociStats> locimap_t;
00095   locimap_t d_locimap;
00096   unordered_map<dnapos_t, unsigned int> d_insertCounts;
00097   string d_name;
00098 
00099 private:
00100   string d_genome;
00101   struct HashPos {
00102     HashPos(uint32_t hash_, dnapos_t pos) : d_hash(hash_), d_pos(pos)
00103     {}
00104     HashPos(){}
00105     uint32_t d_hash;
00106     dnapos_t d_pos;
00107     
00108     bool operator<(const HashPos& rhs) const 
00109     {
00110       return d_hash < rhs.d_hash;
00111     }
00112   };
00113 
00114   typedef vector<HashPos> index_t;
00115   map<int, index_t> d_indexes;
00116 };
 All Classes Files Functions Variables Typedefs Friends Defines