Time for the file-based merge sort code promised long ago. BSD licensed.
Yes, I know it’s way too verbose, quite inefficient and probably pretty ugly but hey it’s just a general idea for you to improve and adjust to your specific needs. It’s intentionally this clear 😉 It can sort things you wouldn’t be able to sort in-memory anyway 😉
In contrast to technical rants, purely algorithmic insights are welcome. I would also appreciate it if someone pointed me to another implementation of file-based merge sort, because I was unable to find one on the web that easily. Perhaps I was looking in all the wrong places… I’m just curious.
#include <cstdio>
#include <vector>
#include <algorithm>
#include <string>
using namespace std;

// File-based (external) merge sort.
//
// Phase 1 reads the input in blocks of `blockSize` records, sorts each block
// in memory with std::sort, and writes it to its own temporary file.
// Phase 2 performs a k-way merge of all temporary files into the output file.
//
// Record must be trivially copyable (it is moved around with raw
// fread/fwrite) and must provide operator< for the merge ordering.
template<typename Record>
class MergeSort {
    // State of a one-record read-ahead buffer for one temporary file.
    enum BufferState { Empty = 0, Full, Eof };

    // Returns the index of the smallest record among all Full buffers,
    // or -1 when every buffer has reached Eof (merge finished).
    static int idxOfMin(const vector<pair<Record, BufferState> > &buffers) {
        int best = -1;
        for (size_t i = 0; i < buffers.size(); ++i) {
            if (buffers[i].second != Full)
                continue;
            if (best < 0 || buffers[i].first < buffers[(size_t)best].first)
                best = (int)i;
        }
        return best;
    }

public:
    // Sorts the binary file `inFileName` (a flat array of Record) into
    // `outFileName`. blockSize is the number of _elements_ (not bytes) each
    // in-memory block / temporary run may contain; it bounds peak memory use.
    //
    // Returns 0 on success, a negative error code on failure:
    //   -1 cannot open input, -2 invalid blockSize, -3 cannot create temp
    //   file, -4 temp write failed, -5 input read error, -6 cannot open
    //   output, -7 temp read error, -8 output write failed.
    int sort(const string &inFileName, const string &outFileName, size_t blockSize) {
        // blockSize == 0 would make &data[0] on an empty vector UB below.
        if (blockSize == 0)
            return -2;

        FILE *ifile = fopen(inFileName.c_str(), "rb");
        if (!ifile)
            return -1;

        int ret = 0;
        FILE *ofile = 0;
        // tmpfile() streams are removed automatically when closed, so no
        // filename bookkeeping (and no tempnam/unlink) is needed.
        vector<FILE*> tmpFiles;

        // --- Phase 1: split input into sorted runs, one temp file each. ---
        {
            vector<Record> data;
            for (;;) {
                // Restore full size every pass: a previous partial block
                // shrank the vector, and fread below writes blockSize
                // records' worth of bytes into it.
                data.resize(blockSize);
                size_t bytesRead = fread(&data[0], 1, blockSize * sizeof(Record), ifile);
                if (bytesRead == 0) {
                    if (ferror(ifile))
                        ret = -5;
                    break; // clean EOF
                }
                data.resize(bytesRead / sizeof(Record));
                std::sort(data.begin(), data.end());

                FILE *tmp = tmpfile();
                if (!tmp) {
                    ret = -3;
                    break;
                }
                // Register before writing so the handle is closed in the
                // cleanup below even when the write fails.
                tmpFiles.push_back(tmp);
                if (fwrite(&data[0], 1, bytesRead, tmp) != bytesRead) {
                    ret = -4;
                    break;
                }
                fseek(tmp, 0, SEEK_SET); // rewind for the merge phase
            }
        }

        // --- Phase 2: k-way merge of all runs into the output file. ---
        if (ret == 0) {
            ofile = fopen(outFileName.c_str(), "wb");
            if (!ofile)
                ret = -6;
        }
        if (ret == 0) {
            // One single-record read-ahead buffer per temporary run.
            vector<pair<Record, BufferState> > buffers(tmpFiles.size());
            while (ret == 0) {
                // Refill every buffer that was consumed last round.
                for (size_t i = 0; i < buffers.size(); ++i) {
                    if (buffers[i].second != Empty)
                        continue;
                    size_t got = fread(&buffers[i].first, 1, sizeof(Record), tmpFiles[i]);
                    if (got == sizeof(Record)) {
                        buffers[i].second = Full;
                    } else if (ferror(tmpFiles[i])) {
                        ret = -7;
                        break;
                    } else {
                        buffers[i].second = Eof;
                    }
                }
                if (ret != 0)
                    break;

                int idx = idxOfMin(buffers);
                if (idx < 0)
                    break; // every run exhausted: done
                buffers[(size_t)idx].second = Empty;
                if (fwrite(&buffers[(size_t)idx].first, 1, sizeof(Record), ofile) != sizeof(Record))
                    ret = -8;
            }
        }

        // Cleanup: tmpfile() streams delete themselves on fclose.
        if (ofile)
            fclose(ofile);
        fclose(ifile);
        for (size_t i = 0; i < tmpFiles.size(); ++i)
            fclose(tmpFiles[i]);
        return ret;
    }
};
Greetings 🙂
Some pointers:
http://stxxl.sourceforge.net/ (Standard Template Library for Extra Large Data Sets)
http://cis.stvincent.edu/html/tutorials/swd/extsort/extsort.html
Wow, thanks Robert 🙂 This is going to be extremely useful. I’m pretty sure I’ve heard about stxxl before but I guess somehow it didn’t show up in the “file-based merge sort” results on Google. The second link is also interesting, the author mentions that he is merging two files at a time while I merge all the files in one run. It would be nice to polish the code and do some performance comparison.