Skip to content

Instantly share code, notes, and snippets.

@arnavm
Forked from gatoravi/2mer.cc
Last active October 10, 2019 04:48
Show Gist options
  • Save arnavm/039e76a34a386a4f29b82682bc8e6c72 to your computer and use it in GitHub Desktop.
Save arnavm/039e76a34a386a4f29b82682bc8e6c72 to your computer and use it in GitHub Desktop.
Takes a k-mer and FASTA as input and outputs BED file containing all position within the FASTA for that k-mer
/*
Copyright 2019 Avinash Ramu & Arnav Moudgil
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Takes a kmer and fasta as input and lists all positions
// within the fasta for that kmer.
// Compilation: g++ kmer.cc -o kmer
// Example usage: ./kmer hg38.fa AT
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
using namespace std;
// Function that returns reverse complement of a DNA sequence
string reverseComplement(string seq) {
string rcseq = seq.substr(); // First copy seq
reverse(rcseq.begin(), rcseq.end()); // Reverse seq
for (std::string::size_type i = 0; i < rcseq.size(); i++) {
switch (rcseq[i]) { // Complement bases
case 'A':
rcseq[i] = 'T';
break;
case 'C':
rcseq[i] = 'G';
break;
case 'G':
rcseq[i] = 'C';
break;
case 'T':
rcseq[i] = 'A';
break;
// case 'N' is not explicitly handled as N's would remain N
}
}
return rcseq;
}
void printKmerEntry(string contig, string kmer, unsigned long position, unsigned long count, char strand, char delim) {
cout << contig << delim // chrom
<< position << delim // start
<< position + kmer.size() << delim // end
<< contig << '.' << kmer << '.' << count << delim // kmer count
<< '.' << delim // value (not used)
<< strand << endl; // strand
}
int main(int argc, char* argv[]) {
if (argc != 3) {
cerr << "Example usage: kmer hg38.fa AT";
return 1;
}
string kmer = argv[2];
// Convert to upper-case
std::transform(kmer.begin(), kmer.end(), kmer.begin(), ::toupper);
// Check if kmer is a palindrome
bool palindrome;
if (kmer == reverseComplement(kmer)) palindrome = true;
else palindrome = false;
// Read the FASTA
ifstream fasta_fh(argv[1]);
std::string line, contig, prev_line, query;
unsigned long position; // character position in entry
char delim = '\t', strand; // delimiter
unsigned long count; // count of how many hits we have
while(std::getline(fasta_fh, line).good()) {
if (line[0] == '>') {
istringstream iss(line.substr(1));
iss >> contig; // Get the contig name
prev_line = "";
position = 0; // Enables 0-indexing
count = 0;
continue;
}
// Convert line to upper-case for case-insensitive matching
std::transform(line.begin(), line.end(), line.begin(), ::toupper);
// Concatenate trailing part of previous line
line = prev_line + line;
// Maximum index to scan along line
long maxIndex = line.size() - (kmer.size() - 1);
if (maxIndex > 0) { // Line is longer than kmer
for (std::string::size_type i = 0; i < maxIndex; i++) {
// Get a kmer-sized substring
query = line.substr(i, kmer.size());
// Check if query string matches kmer
if (query == kmer) {
strand = '+';
count++;
printKmerEntry(contig, kmer, position, count, strand, delim);
}
// Check reverse complement if kmer is not palindrome
if (!palindrome && reverseComplement(query) == kmer) {
strand = '-';
count++;
printKmerEntry(contig, kmer, position, count, strand, delim);
}
position++;
}
prev_line = line.substr(line.size() - (kmer.size() - 1), kmer.size() - 1);
}
else { // kmer is longer than line
prev_line = line.substr();
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment