// Antonie
00001 #pragma once 00002 #include <string> 00003 #include <vector> 00004 #include <unordered_map> 00005 #include <forward_list> 00006 #include <map> 00007 #include "antonie.hh" 00008 #include "fastq.hh" 00009 00010 using std::string; 00011 using std::vector; 00012 using std::unordered_map; 00013 using std::map; 00014 using std::forward_list; 00015 00017 struct FASTQMapping 00018 { 00019 uint64_t pos; 00020 bool reverse; 00021 int indel; // 0 = nothing, >0 means WE have an insert versus reference at pos 00022 // <0 means WE have a delete versus reference at pos 00023 }; 00024 00026 struct GenomeLocusMapping 00027 { 00028 GenomeLocusMapping() : coverage(0) {} 00029 forward_list<FASTQMapping> d_fastqs; 00030 unsigned int coverage; 00031 }; 00032 00033 00035 struct Unmatched 00036 { 00037 string left, unmatched, right; 00038 dnapos_t pos; 00039 }; 00040 00042 class ReferenceGenome 00043 { 00044 public: 00045 ReferenceGenome(const string& fname); 00046 dnapos_t size() const { 00047 return d_genome.size() - 1; // we pad at the beginning so we are 1 based.. 
00048 } 00049 vector<uint32_t> getMatchingHashes(const vector<uint32_t>& hashes); 00050 00052 struct MatchDescriptor 00053 { 00054 dnapos_t pos; 00055 bool reverse; 00056 int score; 00057 }; 00058 void mapFastQ(dnapos_t pos, const FastQRead& fqfrag, int indel=0); 00059 void cover(dnapos_t pos, char quality, int limit); 00060 void cover(dnapos_t pos, unsigned int length, const std::string& quality, int limit) ; 00061 vector<MatchDescriptor> getAllReadPosBoth(FastQRead* fq); // tries original & complement 00062 dnapos_t getReadPosBoth(FastQRead* fq, int qlimit); // tries original & complement 00063 vector<dnapos_t> getReadPositions(const std::string& nucleotides); 00064 00065 vector<dnapos_t> getGCHisto(); 00066 string snippet(dnapos_t start, dnapos_t stop) const; 00067 00068 void printCoverage(FILE* jsfp, const std::string& fname); 00069 void index(unsigned int length); 00070 00071 string getMatchingFastQs(dnapos_t pos, StereoFASTQReader& fastq); 00072 string getMatchingFastQs(dnapos_t start, dnapos_t stop, StereoFASTQReader& fastq); 00073 vector<GenomeLocusMapping> d_mapping; 00074 vector<unsigned int> d_correctMappings, d_wrongMappings, d_gcMappings, d_taMappings; 00075 vector<vector<uint32_t>> d_kmerMappings; 00076 vector<Unmatched> d_unmRegions; 00078 struct LociStats 00079 { 00081 struct Difference 00082 { 00083 char nucleotide; 00084 char quality; 00085 bool headOrTail; 00086 bool operator<(const Difference& b) const 00087 { 00088 return std::tie(nucleotide, quality) < std::tie(b.nucleotide, b.quality); 00089 } 00090 }; 00091 vector<Difference> samples; 00092 }; 00093 dnapos_t d_aCount, d_cCount, d_gCount, d_tCount; 00094 typedef unordered_map<dnapos_t, LociStats> locimap_t; 00095 locimap_t d_locimap; 00096 unordered_map<dnapos_t, unsigned int> d_insertCounts; 00097 string d_name; 00098 00099 private: 00100 string d_genome; 00101 struct HashPos { 00102 HashPos(uint32_t hash_, dnapos_t pos) : d_hash(hash_), d_pos(pos) 00103 {} 00104 HashPos(){} 00105 uint32_t 
d_hash; 00106 dnapos_t d_pos; 00107 00108 bool operator<(const HashPos& rhs) const 00109 { 00110 return d_hash < rhs.d_hash; 00111 } 00112 }; 00113 00114 typedef vector<HashPos> index_t; 00115 map<int, index_t> d_indexes; 00116 };