Time for the file-based merge sort code promised long ago. BSD licensed.
Yes, I know it’s way too verbose, quite inefficient and probably pretty ugly but hey it’s just a general idea for you to improve and adjust to your specific needs. It’s intentionally this clear 😉 It can sort things you wouldn’t be able to sort in-memory anyway 😉
In contrast to technical rants, purely algorithmic insights are welcome. I would also appreciate it if someone pointed me to another implementation of file-based merge sort, because I was unable to find one on the web that easily. Perhaps I was looking in all the wrong places… I’m just curious.
#include <cstdio>
#include <vector>
#include <algorithm>
#include <string>
using namespace std;

// File-based (external) merge sort.
//
// Phase 1 reads the input in blocks of `blockSize` records, sorts each block
// in memory with std::sort, and writes it to its own temporary file.
// Phase 2 performs a k-way merge of all temporary files into the output file.
//
// Record must be trivially copyable (it is moved around with raw
// fread/fwrite) and must provide operator< for the merge ordering.
template<typename Record>
class MergeSort {
    // State of a one-record read-ahead buffer for one temporary file.
    enum BufferState { Empty = 0, Full, Eof };

    // Returns the index of the smallest record among all Full buffers,
    // or -1 when every buffer has reached Eof (merge finished).
    static int idxOfMin(const vector<pair<Record, BufferState> > &buffers) {
        int best = -1;
        for (size_t i = 0; i < buffers.size(); ++i) {
            if (buffers[i].second != Full)
                continue;
            if (best < 0 || buffers[i].first < buffers[(size_t)best].first)
                best = (int)i;
        }
        return best;
    }

public:
    // Sorts the binary file `inFileName` (a flat array of Record) into
    // `outFileName`. blockSize is the number of _elements_ (not bytes) each
    // in-memory block / temporary run may contain; it bounds peak memory use.
    //
    // Returns 0 on success, a negative error code on failure:
    //   -1 cannot open input, -2 invalid blockSize, -3 cannot create temp
    //   file, -4 temp write failed, -5 input read error, -6 cannot open
    //   output, -7 temp read error, -8 output write failed.
    int sort(const string &inFileName, const string &outFileName, size_t blockSize) {
        // blockSize == 0 would make &data[0] on an empty vector UB below.
        if (blockSize == 0)
            return -2;

        FILE *ifile = fopen(inFileName.c_str(), "rb");
        if (!ifile)
            return -1;

        int ret = 0;
        FILE *ofile = 0;
        // tmpfile() streams are removed automatically when closed, so no
        // filename bookkeeping (and no tempnam/unlink) is needed.
        vector<FILE*> tmpFiles;

        // --- Phase 1: split input into sorted runs, one temp file each. ---
        {
            vector<Record> data;
            for (;;) {
                // Restore full size every pass: a previous partial block
                // shrank the vector, and fread below writes blockSize
                // records' worth of bytes into it.
                data.resize(blockSize);
                size_t bytesRead = fread(&data[0], 1, blockSize * sizeof(Record), ifile);
                if (bytesRead == 0) {
                    if (ferror(ifile))
                        ret = -5;
                    break; // clean EOF
                }
                data.resize(bytesRead / sizeof(Record));
                std::sort(data.begin(), data.end());

                FILE *tmp = tmpfile();
                if (!tmp) {
                    ret = -3;
                    break;
                }
                // Register before writing so the handle is closed in the
                // cleanup below even when the write fails.
                tmpFiles.push_back(tmp);
                if (fwrite(&data[0], 1, bytesRead, tmp) != bytesRead) {
                    ret = -4;
                    break;
                }
                fseek(tmp, 0, SEEK_SET); // rewind for the merge phase
            }
        }

        // --- Phase 2: k-way merge of all runs into the output file. ---
        if (ret == 0) {
            ofile = fopen(outFileName.c_str(), "wb");
            if (!ofile)
                ret = -6;
        }
        if (ret == 0) {
            // One single-record read-ahead buffer per temporary run.
            vector<pair<Record, BufferState> > buffers(tmpFiles.size());
            while (ret == 0) {
                // Refill every buffer that was consumed last round.
                for (size_t i = 0; i < buffers.size(); ++i) {
                    if (buffers[i].second != Empty)
                        continue;
                    size_t got = fread(&buffers[i].first, 1, sizeof(Record), tmpFiles[i]);
                    if (got == sizeof(Record)) {
                        buffers[i].second = Full;
                    } else if (ferror(tmpFiles[i])) {
                        ret = -7;
                        break;
                    } else {
                        buffers[i].second = Eof;
                    }
                }
                if (ret != 0)
                    break;

                int idx = idxOfMin(buffers);
                if (idx < 0)
                    break; // every run exhausted: done
                buffers[(size_t)idx].second = Empty;
                if (fwrite(&buffers[(size_t)idx].first, 1, sizeof(Record), ofile) != sizeof(Record))
                    ret = -8;
            }
        }

        // Cleanup: tmpfile() streams delete themselves on fclose.
        if (ofile)
            fclose(ofile);
        fclose(ifile);
        for (size_t i = 0; i < tmpFiles.size(); ++i)
            fclose(tmpFiles[i]);
        return ret;
    }
};
Greetings 🙂
Some pointers:
http://stxxl.sourceforge.net/ (Standard Template Library for Extra Large Data Sets)
http://cis.stvincent.edu/html/tutorials/swd/extsort/extsort.html
Wow, thanks Robert 🙂 This is going to be extremely useful. I’m pretty sure I’ve heard about stxxl before but I guess somehow it didn’t show up in the “file-based merge sort” results on Google. The second link is also interesting, the author mentions that he is merging two files at a time while I merge all the files in one run. It would be nice to polish the code and do some performance comparison.