Last active
July 13, 2016 21:23
-
-
Save engelmarkus/895a79811bfa0b3b4564f3833da31889 to your computer and use it in GitHub Desktop.
k-way merge sort - sorting data using a fixed amount of memory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <array> | |
#include <fstream> | |
#include <numeric> | |
#include <random> | |
using namespace std; | |
// Step 1 - Create a file with 256 MiB of data | |
array<uint64_t, 256 * 1024 * 1024 / sizeof(uint64_t)> data; | |
int main() { | |
iota(begin(data), end(data), 0); | |
random_device rd; | |
mt19937 gen(rd()); | |
shuffle(begin(data), end(data), gen); | |
ofstream file("data.dat"); | |
file.write(reinterpret_cast<char*>(data.data()), data.size() * sizeof(uint64_t)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <array> | |
#include <algorithm> | |
#include <fstream> | |
#include <limits> | |
#include <vector> | |
using namespace std; | |
// Step 3 - Merge chunks | |
constexpr size_t const ElemsPerBlock = 1024 * 1024; | |
uint64_t getElemAt(ifstream& ifs, size_t blockNum, size_t index) { | |
ifs.seekg((blockNum * ElemsPerBlock + index) * sizeof(uint64_t), ifs.beg); | |
uint64_t result; | |
ifs.read(reinterpret_cast<char*>(&result), sizeof(uint64_t)); | |
return result; | |
} | |
int main() { | |
ifstream din("data_sorted.dat"); | |
din.seekg(0, din.end); | |
auto length = din.tellg(); | |
din.seekg(0, din.beg); | |
ofstream dout("data_merged.dat"); | |
vector<uint64_t> data(length / sizeof(uint64_t) / ElemsPerBlock); | |
vector<uint64_t> indizes(length / sizeof(uint64_t) / ElemsPerBlock); | |
// init data | |
for (auto i = 0; i < data.size(); ++i) { | |
data[i] = getElemAt(din, i, 0); | |
} | |
auto to_write = length / sizeof(uint64_t); | |
auto written = 0; | |
while (written < to_write) { | |
// pick smallest element | |
auto minelem = min_element(begin(data), end(data)); | |
auto pos = distance(begin(data), minelem); | |
dout.write(reinterpret_cast<char*>(&*minelem), sizeof(uint64_t)); | |
written++; | |
indizes[pos]++; | |
data[pos] = (indizes[pos] >= ElemsPerBlock) ? numeric_limits<uint64_t>::max() : getElemAt(din, pos, indizes[pos]); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <array> | |
#include <algorithm> | |
#include <fstream> | |
using namespace std; | |
constexpr size_t const BufferSize = 1024 * 1024; | |
using Buffer = array<uint64_t, BufferSize>; | |
// Step 2 - Sort chunks of data using only 8 MiB of RAM | |
Buffer buf; | |
int main() { | |
ifstream din("data.dat"); | |
din.seekg(0, din.end); | |
auto length = din.tellg(); | |
din.seekg(0, din.beg); | |
ofstream dout("data_sorted.dat"); | |
for (auto i = 0; i < length; i += buf.size() * sizeof(Buffer::value_type)) { | |
din.read(reinterpret_cast<char*>(buf.data()), buf.size() * sizeof(Buffer::value_type)); | |
sort(begin(buf), end(buf)); | |
dout.write(reinterpret_cast<char*>(buf.data()), buf.size() * sizeof(Buffer::value_type)); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment