Created
October 15, 2016 07:45
-
-
Save khvorov/f00a6c9996208d4d55f5ff5e73330832 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <fstream> | |
#include <iostream> | |
#include <set> | |
#include <string> | |
#include <unordered_set> | |
int main(int argc, char * argv[]) | |
{ | |
if (argc != 2) | |
{ | |
std::cerr << "usage: " << argv[0] << " <filename>\n"; | |
return -1; | |
} | |
// open a file | |
std::ifstream in(argv[1]); | |
if (!in.is_open()) | |
{ | |
std::cerr << "failed to open a file: " << argv[1] << "\n"; | |
return -1; | |
} | |
uint64_t count = 0; | |
// implementation based on associative containers: | |
// set ~ 8 sec | |
// unorderd_set ~ 4 sec | |
std::set<std::string> s; | |
//std::unordered_set<std::string> s; | |
std::string line; | |
while (std::getline(in, line)) | |
{ | |
if (!s.emplace(line).second) | |
++count; | |
} | |
std::cout << "found " << count << " duplicates\n"; | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <sys/mman.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <iostream> | |
#include <vector> | |
#define handle_error(msg) do { perror(msg); exit(EXIT_FAILURE); } while(0) | |
int main(int argc, char * argv[]) | |
{ | |
if (argc != 2) | |
{ | |
std::cerr << "usage: " << argv[0] << " <filename>\n"; | |
return -1; | |
} | |
// open a file | |
int fd = open(argv[1], O_RDWR); | |
if (fd == -1) | |
handle_error("open"); | |
// get file size | |
struct stat sb; | |
if (fstat(fd, &sb) == -1) | |
handle_error("fstat"); | |
char * addr = (char *) mmap(nullptr, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); | |
if (addr == MAP_FAILED) | |
handle_error("mmap"); | |
uint64_t count = 0; | |
// implementation based on vector<bool> ~ 0.6 sec | |
// vector<int> ~ 1.1 sec | |
// with stdio readline ~ 0.45 sec | |
// custom impl of bit vector ~ 0.45 sec | |
// with mmap ~ 0.119 sec | |
using bits_type = uint64_t; | |
constexpr std::size_t cBits = sizeof(bits_type) * 8; | |
constexpr std::size_t cContSize = 26 * 26 * 26 * 10 * 10 * 10; | |
std::vector<bits_type> cont(cContSize / cBits + 1, 0); | |
for (char * line = addr; line < addr + sb.st_size; ++line) | |
{ | |
std::size_t pos = 0; | |
pos += 26L * 26 * (*line++ - 'A'); | |
pos += 26L * (*line++ - 'A'); | |
pos += (*line++ - 'A'); | |
pos *= 1000; | |
pos += 100L * (*line++ - '0'); | |
pos += 10L * (*line++ - '0'); | |
pos += (*line++ - '0'); | |
// try to eliminate conditions | |
const std::size_t shift = pos % cBits; | |
pos /= cBits; | |
count += (cont[pos] >> shift) & 0x01; | |
cont[pos] |= (1LL << shift); | |
} | |
std::cout << "found " << count << " duplicates\n"; | |
munmap(addr, sb.st_size); | |
close(fd); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
find_dup.cpp is a non-optimized version.
Compile: g++ -Wall -pedantic -std=c++14 -O3 -march=native -o find_dup_opt{,.cpp}
File format as follows:
$ head tmp/Rgn01.txt
JXR036
AYW849
LCX483
WAT514
JAY047
TGC100
WDN986
...