#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Size in bytes of each of the three working buffers (the input window and
// the two record buffers used for duplicate detection). Parenthesized so the
// macro expands safely inside any expression (e.g. `x / BUFFER_CAPACITY`).
#define BUFFER_CAPACITY (1024 * 1024 * 25)
// Number of '|'-separated fields a line must contain to be processed.
#define INDICES_EXPECTED 32
// Upper bound on the field start offsets recorded per line; separators found
// beyond this limit are ignored.
#define INDICES_MAXIMUM 40
// Number of fields copied to the output for every accepted line.
#define INDICES_WRITE 16
// Progress is reported every 2^PRINT_FREQUENCY written lines.
#define PRINT_FREQUENCY 16
// Filters a '|'-delimited text file.
//
// usage: <program> <input> <output>
//
// The input is scanned byte by byte through a sliding window. A '|' ends a
// field; any byte below ' ' (control characters, including CR and LF) ends a
// line. Lines with exactly INDICES_EXPECTED fields are reduced to the fields
// listed in `indicesWrite`, joined with '|', and written to the output unless
// they are identical to the previously written record. Lines with any other
// number of fields are dropped.
//
// NOTE(review): every terminator byte is copied to the output unconditionally,
// so dropped and duplicate lines leave an empty line behind. This is kept as
// is because it is observable output behavior - confirm whether it is intended.
//
// Returns 0 on success (and when only the usage text is printed), 1 when a
// file cannot be opened, memory cannot be allocated, an I/O error occurs, or
// a single line does not fit into the input buffer.
int main (int argc, char* argv[])
{
    unsigned char* bufferInput;
    unsigned char* bufferLast;
    unsigned char* bufferPrev;
    unsigned long bufferSize;
    unsigned char* bufferSwap;
    unsigned long indicesArray[INDICES_MAXIMUM + 1];
    unsigned long indicesCount;
    unsigned long indicesWrite[INDICES_WRITE] = {0, 1, 14, 7, 8, 16, 10, 12, 13, 11, 9, 25, 26, 27, 17, 18};
    FILE* streamWriter;
    FILE* streamReader;
    unsigned long block;
    unsigned long dupes;
    unsigned long lines;
    unsigned long prevSize;
    unsigned long size;
    unsigned long i;
    unsigned long j;
    int atEnd;
    int status;

    if (argc < 3)
    {
        printf ("usage: %s <input> <output>\n", argv[0]);
        return 0;
    }

    streamReader = fopen (argv[1], "rb");
    if (!streamReader)
    {
        fprintf (stderr, "cannot open input file \"%s\" for reading\n", argv[1]);
        return 1;
    }

    streamWriter = fopen (argv[2], "wb");
    if (!streamWriter)
    {
        fprintf (stderr, "cannot open output file \"%s\" for writing\n", argv[2]);
        fclose (streamReader);
        return 1;
    }

    // One allocation split into three equally sized regions: the input
    // window, the record being assembled and the last record written.
    bufferInput = malloc (sizeof (*bufferInput) * BUFFER_CAPACITY * 3);
    if (!bufferInput)
    {
        fprintf (stderr, "cannot allocate working buffers\n");
        fclose (streamWriter);
        fclose (streamReader);
        return 1;
    }

    bufferLast = bufferInput + BUFFER_CAPACITY;
    bufferPrev = bufferLast + BUFFER_CAPACITY;
    bufferSize = 0;
    indicesArray[0] = 0;
    indicesCount = 1;
    dupes = 0;
    lines = 0;
    prevSize = 0; // 0 = nothing written yet, bufferPrev holds no valid record
    status = 0;
    atEnd = 0;

    printf ("starting process...\n");

    for (i = 0; !atEnd; ++i)
    {
        // End of buffer reached: content must be shifted to the left before
        // buffer is populated with incoming data from input stream
        if (i == bufferSize)
        {
            // The current line already starts at offset 0 and fills the whole
            // buffer: there is no room left to read its remainder.
            if (indicesArray[0] == 0 && bufferSize >= (BUFFER_CAPACITY))
            {
                fprintf (stderr, "line exceeds buffer capacity of %lu bytes\n", (unsigned long)(BUFFER_CAPACITY));
                status = 1;
                break;
            }

            i = bufferSize - indicesArray[0];
            memmove (bufferInput, bufferInput + indicesArray[0], i * sizeof (*bufferInput));
            bufferSize = i + fread (bufferInput + i, sizeof (*bufferInput), BUFFER_CAPACITY - i, streamReader);

            // Rebase the recorded offsets; element 0 is handled last so that
            // it is still intact while the others are adjusted.
            for (j = indicesCount; j--; )
                indicesArray[j] -= indicesArray[0];

            if (i >= bufferSize)
            {
                if (ferror (streamReader))
                {
                    fprintf (stderr, "cannot read from input file \"%s\"\n", argv[1]);
                    status = 1;
                    break;
                }

                // No more input and no pending bytes: finished
                if (i == 0)
                    break;

                // Input ended without a terminator: handle the pending bytes
                // as a final line whose (virtual) terminator sits at index i
                atEnd = 1;
            }
        }

        // End of item found: save starting index of the next one
        if (!atEnd && bufferInput[i] == '|')
        {
            if (indicesCount < INDICES_MAXIMUM)
                indicesArray[indicesCount++] = i + 1;
        }
        // End of line found: write required indices and flush list
        else if (atEnd || bufferInput[i] < ' ')
        {
            if (indicesCount == INDICES_EXPECTED)
            {
                // Sentinel offset so the length of the last field can be
                // computed the same way as for all other fields
                indicesArray[indicesCount] = i + 1;
                size = 0;

                // Assemble the record: selected fields separated by '|' and
                // terminated by a NUL byte (the NUL is never written out)
                for (j = 0; j < INDICES_WRITE; ++j)
                {
                    block = indicesArray[indicesWrite[j] + 1] - indicesArray[indicesWrite[j]] - 1;
                    memcpy (bufferLast + size, bufferInput + indicesArray[indicesWrite[j]], block * sizeof (*bufferInput));
                    size += block;
                    bufferLast[size++] = (j + 1 < INDICES_WRITE) ? '|' : 0;
                }

                if (size > 0)
                {
                    // Comparing the lengths first keeps memcmp from reading
                    // bufferPrev before any record has been stored in it
                    if (size != prevSize || memcmp (bufferLast, bufferPrev, size * sizeof (*bufferLast)) != 0)
                    {
                        if ((++lines & (((unsigned)1 << PRINT_FREQUENCY) - 1)) == 0)
                            printf ("writing line %lu (%lu duplicate(s))...\n", lines, dupes);

                        if (fwrite (bufferLast, sizeof (*bufferLast), size - 1, streamWriter) != size - 1)
                        {
                            fprintf (stderr, "cannot write to output file \"%s\"\n", argv[2]);
                            status = 1;
                            break;
                        }

                        // The record just written becomes the reference for
                        // the next duplicate check
                        bufferSwap = bufferLast;
                        bufferLast = bufferPrev;
                        bufferPrev = bufferSwap;
                        prevSize = size;
                    }
                    else
                        ++dupes;
                }
            }

            // Copy the terminator byte through (there is none at end of input)
            if (!atEnd)
            {
                if (fwrite (bufferInput + i, sizeof (*bufferInput), 1, streamWriter) != 1)
                {
                    fprintf (stderr, "cannot write to output file \"%s\"\n", argv[2]);
                    status = 1;
                    break;
                }
            }

            indicesArray[0] = i + 1;
            indicesCount = 1;
        }
    }

    if (status == 0 && (lines & (((unsigned)1 << PRINT_FREQUENCY) - 1)) != 0)
        printf ("writing line %lu (%lu duplicate(s))...\n", lines, dupes);

    // fclose flushes buffered output, so a failure here means lost data
    if (fclose (streamWriter) != 0 && status == 0)
    {
        fprintf (stderr, "cannot write to output file \"%s\"\n", argv[2]);
        status = 1;
    }

    fclose (streamReader);
    free (bufferInput);

    if (status == 0)
        printf ("done.\n");

    return status;
}