Fast File Reading

Fast extraction of a relatively small amount of data from a large file is easy with J, using mapped files.

For example, extracting the distinct IP addresses from a log file can be done as follows:

J version: Extract distinct IP addresses following a label from a large file

findInFile=: 4 : 0
NB. Find the distinct data items that follow a label in a file.
NB. x: label preceding data
NB. y: the file name
NB. Each data item is taken to end at the first blank space; at most
NB. 16 characters after the label are examined.
NB. Result: character matrix with one distinct item per row.
NB. NOTE(review): if the label occurs within (#x)+16 characters of the end
NB. of the file, the indices built below would fall outside the file -
NB. confirm how the caller's data avoids this.
        
        JCHAR map_jmf_ 'file';y         NB. mapped files really speed things up
        nos=. x I.@:E. file             NB. find the positions of the label in file
        ip=.(nos+/(#x)+i.16){file       NB. matrix with maximum no. of columns
        unmap_jmf_ 'file'               NB. ip already holds the selected characters
        ~.({."0 1~  (i."1 &' '))ip      NB. extract data from each line until a blank is found
                                                                NB. (a row without a blank is kept whole)
                                                                NB. return the unique values
)

On a small Acer Aspire One, J took about 0.65 seconds to extract 16000 IP addresses, and the 10 distinct IP addresses among them, from a 38 MB file.

This program could have been written in C, saving perhaps 0.30 seconds, but at the cost of a bit more effort.

test=: 3 : 0
NB. Build a large test file, run findInFile on it, then delete the file.
NB. y is not used.
NB. Note that file is assigned globally (=:).
NB. NOTE(review): findInFile maps the file onto a noun that is also named
NB. file - confirm that this does not disturb the global assigned here
NB. before ferase uses it.
        file=:'testfile.2'
NB. 20000 records: the label, then up to 15 characters cut from a cyclic
NB. stream of dot-terminated random numbers in 1..255 (blanks removed),
NB. then 2000 blanks of padding; everything is ravelled into one string.
        out=.,(20000 2000$' '),.~' rhost=',"1(' '-.~"1(}:"1 (20000 16$,'.',"1~":>:?40 1$255)))
        out fwrite file
        label =.' rhost='
        label findInFile file
NB. ferase is the last sentence, so its result (not the list of addresses
NB. found above) is what test returns.
        ferase file
) 

C version: Extract distinct IP addresses following a label from a large file

//////////////////////////////////////////////////////
//                                                  //
//  Small C mmap() sample.                          //
//  Written by Martin Cyr.                          //
//  Feel free to change and distribute, but credit  //
//  is always nice. If you use, I'd be pleased to   //
//  hear from you at Spooles at GMail dot com.      //
//                                                  //
//////////////////////////////////////////////////////

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

// Number of host slots processFile starts with before any growth.
#define INITIAL_CAPACITY 25
// Size in bytes of the buffer that holds one host string, terminator included.
#define IP_CHAR_LEN 16
// Not referenced by any code visible in this listing.
#define LINE_CHAR_LEN 1024
// Label that precedes each host in the input; note the leading space.
#define PATTERN " rhost="

// Forward declarations for the functions defined below main().
// Every declaration is a full prototype so the compiler can check the
// arguments at each call site; an empty parameter list "()" would switch
// that checking off and would mean "(void)" under C23.
void showUsage(char* filename);
int processFile(char* filename, char*** hosts);
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset);
int countMatches(char* match, char** array, int count);
void printDestroyArray(char** array, int count);

// Program entry point.
// Expects exactly one argument: the name of the file to scan. The distinct
// hosts found in it are printed one per line. Any other argument count
// prints the usage text and exits with a failure status so that calling
// scripts can detect the misuse.
int main(int argc, char** argv)
{
        char** hosts = NULL;
        int hostCount;

        if (argc != 2)
        {
                // argv[0] may be NULL when the program is started with an empty argv
                showUsage((argc > 0 && argv[0] != NULL) ? argv[0] : "program");
                return EXIT_FAILURE;
        }

        hostCount = processFile(argv[1], &hosts);

        // printDestroyArray frees each string; the array itself is freed here
        printDestroyArray(hosts, hostCount);
        free(hosts);

        return EXIT_SUCCESS;
}

// Prints the three-line usage text to stdout.
// filename: the program name to show in the "Usage:" line.
void showUsage(char* filename)
{
        // Only the first line needs formatting; the rest is constant text
        printf("Usage: %s <filename>\n", filename);
        fputs("\tParses the <filename> for occurences of rhost= \n", stdout);
        fputs("\tand sends everything to stdout\n", stdout);
}

// Counts how many of the first `count` strings in `array` equal `match`.
// match: NUL-terminated string to look for.
// array: strings to compare against, examined front to back.
// count: number of entries to examine; zero or less examines none.
// Returns the number of exact (strcmp) matches.
int countMatches(char* match, char** array, int count)
{
        char** cursor = array;
        int remaining = count;
        int hits = 0;

        while (remaining-- > 0)
        {
                if (!strcmp(match, *cursor++))
                        hits += 1;
        }

        return hits;
}

// Prints each of the first `count` strings in `array` on its own line and
// frees it. The array itself is NOT freed; that is left to the caller.
void printDestroyArray(char** array, int count)
{
        char** entry = array;
        int remaining;

        for (remaining = count; remaining > 0; remaining--, entry++)
        {
                puts(*entry);   // writes the string followed by a newline
                free(*entry);
        }
}

// Scans `filename` for every occurrence of PATTERN and collects the distinct
// tokens that follow it.
//
// filename: path of the file to scan; it is opened read-only and memory mapped.
// hosts:    out parameter. Receives a newly allocated array of newly allocated
//           strings. The caller owns both (printDestroyArray frees the
//           strings, free() releases the array).
// Returns the number of distinct hosts stored in *hosts.
//
// Tokens are handed to addNextHost through a small bounded copy, so at most
// IP_CHAR_LEN - 1 characters of a token are kept and nothing is ever read
// beyond the end of the mapping. An empty file yields zero hosts.
// Any allocation, open, stat or mapping failure prints a message and exits.
int processFile(char* filename, char*** hosts)
{
        const size_t patternLen = strlen(PATTERN);
        int hostCount = 0, hostMax = INITIAL_CAPACITY;
        struct stat results;
        size_t size, i;
        char* map;
        int fd;

        // Keeps the generous IP_CHAR_LEN bytes per slot, which is at least
        // as much as hostMax pointers need.
        (*hosts) = malloc(hostMax * IP_CHAR_LEN * sizeof(char));
        if (*hosts == NULL)
        {
                perror("Error allocating the host list");
                exit(EXIT_FAILURE);
        }

        fd = open(filename, O_RDONLY);
        if (fd == -1)
        {
                perror("Error opening file");
                exit(EXIT_FAILURE);
        }

        // fstat() on the open descriptor: the size then belongs to the very
        // file that gets mapped, even if the path is replaced meanwhile
        if (fstat(fd, &results) != 0)
        {
                perror("Unable to get file stats");
                exit(EXIT_FAILURE);
        }

        // mmap() rejects a zero length, and an empty file has no hosts anyway
        if (results.st_size <= 0)
        {
                close(fd);
                return 0;
        }
        size = (size_t)results.st_size;

        map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

        if (map == MAP_FAILED)
        {
                perror("Error mapping the file");
                exit(EXIT_FAILURE);
        }

        // Test every position for the label. Restarting the comparison at each
        // byte (rather than resetting a running match counter) also finds a
        // label that directly follows a partial match, e.g. "  rhost=".
        for (i = 0; i + patternLen <= size; i++)
        {
                char token[IP_CHAR_LEN];
                size_t start, len;

                if (map[i] != PATTERN[0] || memcmp(map + i, PATTERN, patternLen) != 0)
                        continue;

                // Bounded copy of what follows the label, closed with a blank
                // so that addNextHost always finds a delimiter
                start = i + patternLen;
                len = size - start;
                if (len > IP_CHAR_LEN - 1)
                        len = IP_CHAR_LEN - 1;
                memcpy(token, map + start, len);
                token[len] = ' ';

                hostCount = addNextHost(hosts, hostCount, &hostMax, token, 0);

                // Resume right behind the label (the loop adds the final +1)
                i = start - 1;
        }

        if (munmap(map, size) == -1)
        {
                perror("Error unmapping the file");
        }

        close(fd);
        return hostCount;
}

int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset)
{
        char host[IP_CHAR_LEN];
        int pos = 0;

        if (hostNum > *hostMax)
        {
                *hostMax *= 2;
                *hosts = (char**)realloc(*hosts, IP_CHAR_LEN * (*hostMax) * sizeof(char));
        }

        while ((map[offset+pos] != ' ') && (map[offset+pos] != '\n') && (map[offset+pos] != '\r') && (map[offset+pos] != '\t'))
        {
            
            host[pos] = map[offset+pos];
                pos++;
        }

        host[pos] = 0;

        if ((pos > 0) && (countMatches(host, *hosts, hostNum) == 0))
        {
                (*hosts)[hostNum] = (char*)calloc(IP_CHAR_LEN, sizeof(char));
                strncpy((*hosts)[hostNum], host, pos);
                hostNum++;
        }

        return hostNum;
}


CategoryWorkInProgress

Scripts/Fast File Read (last edited 2008-12-08 10:45:28 by )