-
-
Save arnavm/039e76a34a386a4f29b82682bc8e6c72 to your computer and use it in GitHub Desktop.
Takes a k-mer and FASTA as input and outputs BED file containing all position within the FASTA for that k-mer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright 2019 Avinash Ramu & Arnav Moudgil | |
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
*/ | |
// Takes a kmer and fasta as input and lists all positions | |
// within the fasta for that kmer. | |
// Compilation: g++ kmer.cc -o kmer | |
// Example usage: ./kmer hg38.fa AT | |
#include <iostream> | |
#include <fstream> | |
#include <sstream> | |
#include <algorithm> | |
using namespace std; | |
// Function that returns reverse complement of a DNA sequence | |
string reverseComplement(string seq) { | |
string rcseq = seq.substr(); // First copy seq | |
reverse(rcseq.begin(), rcseq.end()); // Reverse seq | |
for (std::string::size_type i = 0; i < rcseq.size(); i++) { | |
switch (rcseq[i]) { // Complement bases | |
case 'A': | |
rcseq[i] = 'T'; | |
break; | |
case 'C': | |
rcseq[i] = 'G'; | |
break; | |
case 'G': | |
rcseq[i] = 'C'; | |
break; | |
case 'T': | |
rcseq[i] = 'A'; | |
break; | |
// case 'N' is not explicitly handled as N's would remain N | |
} | |
} | |
return rcseq; | |
} | |
void printKmerEntry(string contig, string kmer, unsigned long position, unsigned long count, char strand, char delim) { | |
cout << contig << delim // chrom | |
<< position << delim // start | |
<< position + kmer.size() << delim // end | |
<< contig << '.' << kmer << '.' << count << delim // kmer count | |
<< '.' << delim // value (not used) | |
<< strand << endl; // strand | |
} | |
int main(int argc, char* argv[]) { | |
if (argc != 3) { | |
cerr << "Example usage: kmer hg38.fa AT"; | |
return 1; | |
} | |
string kmer = argv[2]; | |
// Convert to upper-case | |
std::transform(kmer.begin(), kmer.end(), kmer.begin(), ::toupper); | |
// Check if kmer is a palindrome | |
bool palindrome; | |
if (kmer == reverseComplement(kmer)) palindrome = true; | |
else palindrome = false; | |
// Read the FASTA | |
ifstream fasta_fh(argv[1]); | |
std::string line, contig, prev_line, query; | |
unsigned long position; // character position in entry | |
char delim = '\t', strand; // delimiter | |
unsigned long count; // count of how many hits we have | |
while(std::getline(fasta_fh, line).good()) { | |
if (line[0] == '>') { | |
istringstream iss(line.substr(1)); | |
iss >> contig; // Get the contig name | |
prev_line = ""; | |
position = 0; // Enables 0-indexing | |
count = 0; | |
continue; | |
} | |
// Convert line to upper-case for case-insensitive matching | |
std::transform(line.begin(), line.end(), line.begin(), ::toupper); | |
// Concatenate trailing part of previous line | |
line = prev_line + line; | |
// Maximum index to scan along line | |
long maxIndex = line.size() - (kmer.size() - 1); | |
if (maxIndex > 0) { // Line is longer than kmer | |
for (std::string::size_type i = 0; i < maxIndex; i++) { | |
// Get a kmer-sized substring | |
query = line.substr(i, kmer.size()); | |
// Check if query string matches kmer | |
if (query == kmer) { | |
strand = '+'; | |
count++; | |
printKmerEntry(contig, kmer, position, count, strand, delim); | |
} | |
// Check reverse complement if kmer is not palindrome | |
if (!palindrome && reverseComplement(query) == kmer) { | |
strand = '-'; | |
count++; | |
printKmerEntry(contig, kmer, position, count, strand, delim); | |
} | |
position++; | |
} | |
prev_line = line.substr(line.size() - (kmer.size() - 1), kmer.size() - 1); | |
} | |
else { // kmer is longer than line | |
prev_line = line.substr(); | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment