Time for the file-based merge sort code promised long ago. BSD licensed.
Yes, I know it’s way too verbose, quite inefficient and probably pretty ugly — but hey, it’s just a general idea for you to improve and adjust to your specific needs. It’s intentionally this clear 😉 It can sort things you wouldn’t be able to sort in-memory anyway 😉
In contrast to technical rants, purely algorithmic insights are welcome. I would also appreciate it if someone pointed me to another implementation of file-based merge sort, because I was unable to find one on the web that easily. Perhaps I was looking in all the wrong places… I’m just curious.
#include <algorithm>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>
#include <unistd.h>
using namespace std;
template<typename Record> class MergeSort
{
    // State of the one-record read-ahead slot kept for each sorted run
    // during the k-way merge phase.
    enum BufferState
    {
        Empty = 0, // slot must be refilled from its run file
        Full,      // slot holds the next unconsumed record of its run
        Eof        // the run file is exhausted
    };

    // Returns the index of the Full slot holding the smallest record
    // (compared with Record::operator<), or -1 when no slot is Full,
    // i.e. every run has been fully consumed.
    int idxOfMin(const std::vector<std::pair<Record, BufferState> > &buffers)
    {
        int ret = -1;
        for (size_t i = 0; i < buffers.size(); i++)
        {
            if (buffers[i].second != Full) continue;
            if (ret == -1 || buffers[i].first < buffers[ret].first)
                ret = static_cast<int>(i);
        }
        return ret;
    }

public:
    // External (file-based) merge sort of fixed-size binary records.
    //
    // Phase 1: read the input in chunks of at most blockSize records,
    // sort each chunk in memory with std::sort, and spill it to an
    // anonymous temporary file (tmpfile(), deleted automatically when
    // closed — no name/unlink bookkeeping and no tempnam() string leak).
    // Phase 2: k-way merge all runs into outFileName.
    //
    // Record must be trivially copyable (it is transferred with raw
    // fread/fwrite) and must provide operator<.  Trailing input bytes
    // that do not form a whole Record are silently dropped.
    //
    // blockSize is the number of _records_ (not bytes) per in-memory chunk.
    // Returns 0 on success, or a negative error code:
    //  -1 input cannot be opened        -2 blockSize is zero
    //  -3 temp file creation failed     -4 temp file write failed
    //  -5 input read error              -6 output cannot be opened
    //  -7 temp file read error          -8 output write failed
    int sort(const std::string &inFileName, const std::string &outFileName, size_t blockSize)
    {
        if (blockSize == 0) return -2; // need room for at least one record

        // Everything that must survive a 'goto exit' is declared up front.
        FILE *ofile = 0;
        std::vector<FILE*> tmpFiles;                         // one sorted run each
        std::vector<std::pair<Record, BufferState> > buffers;
        std::vector<Record> data(blockSize);                 // in-memory sort buffer

        FILE *ifile = fopen(inFileName.c_str(), "rb");
        if (!ifile) return -1;
        int ret = 0;
        size_t bytesRead;

        // Phase 1: produce sorted runs.  The buffer keeps its full
        // blockSize capacity throughout; only the first 'count' slots of
        // a (possibly short) read are sorted and written, so a partial
        // final chunk can never make a later fread overrun the vector.
        while ((bytesRead = fread(&data[0], 1, blockSize * sizeof(Record), ifile)) > 0)
        {
            size_t count = bytesRead / sizeof(Record);
            if (count == 0) break; // only a truncated trailing record was left
            std::sort(data.begin(), data.begin() + count);
            FILE *run = tmpfile(); // anonymous; removed by the OS on fclose
            if (!run)
            {
                ret = -3;
                goto exit;
            }
            // Registered before writing so 'exit' always closes it,
            // even when the fwrite below fails.
            tmpFiles.push_back(run);
            if (fwrite(&data[0], 1, count * sizeof(Record), run) != count * sizeof(Record))
            {
                ret = -4;
                goto exit;
            }
            fseek(run, 0, SEEK_SET); // rewind: the merge phase reads it back
        }
        if (ferror(ifile))
        {
            ret = -5;
            goto exit;
        }

        // Phase 2: k-way merge.  resize() value-initializes every slot,
        // so each BufferState starts as Empty (0).
        buffers.resize(tmpFiles.size());
        ofile = fopen(outFileName.c_str(), "wb");
        if (!ofile)
        {
            ret = -6;
            goto exit;
        }
        while (true)
        {
            // Refill every Empty slot with the next record of its run.
            for (size_t i = 0; i < buffers.size(); i++)
            {
                if (buffers[i].second != Empty) continue;
                if (fread(&buffers[i].first, 1, sizeof(Record), tmpFiles[i]) == sizeof(Record))
                {
                    buffers[i].second = Full;
                } else if (ferror(tmpFiles[i]))
                {
                    ret = -7;
                    goto exit;
                } else
                {
                    buffers[i].second = Eof;
                }
            }
            int idx = idxOfMin(buffers);
            if (idx == -1) break; // every run exhausted: done
            buffers[idx].second = Empty;
            if (fwrite(&buffers[idx].first, 1, sizeof(Record), ofile) != sizeof(Record))
            {
                ret = -8;
                goto exit;
            }
        }
    exit:
        if (ofile) fclose(ofile);
        fclose(ifile);
        for (size_t i = 0; i < tmpFiles.size(); i++)
            fclose(tmpFiles[i]); // tmpfile() handles are unlinked by the OS
        return ret;
    }
};

Greetings 🙂
Some pointers:
http://stxxl.sourceforge.net/ (Standard Template Library for Extra Large Data Sets)
http://cis.stvincent.edu/html/tutorials/swd/extsort/extsort.html
Wow, thanks Robert 🙂 This is going to be extremely useful. I’m pretty sure I’ve heard about stxxl before but I guess somehow it didn’t show up in the “file-based merge sort” results on Google. The second link is also interesting, the author mentions that he is merging two files at a time while I merge all the files in one run. It would be nice to polish the code and do some performance comparison.