/* $Id: mkary.c,v 1.3 1999/10/13 13:26:16 tatuo-y Exp $ */
/* Time-stamp: <99/10/13 21:55:20 tatuo-y> */
/*
 * Program that builds a suffix array.
 * Reference: Kenneth W. Church, NLPRS '95  Invited Lecture
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <assert.h>
#include <stdarg.h>

#define FNLEN 1000 /* size of the file name buffers in main() */
#define STR_CASH_SIZE 20 /* number of suffix bytes cached in BLOCK.strbuf */

#define MODE_ON 1
#define MODE_OFF 0

/* Non-zero when the most significant bit (0x80) of the byte CHAR is set. */
#define EIGHTH_BIT_ON(CHAR) (0x80 & (unsigned char)(CHAR))

/* Heap entry describing one independently sorted block of the suffix array */
typedef struct {
    long aryidx; /* index into suf[] of the block's next entry to be merged */
    long max; /* index one past the block's last entry in suf[] */
    long size; /* NOTE(review): never read or written in the code visible here */
    char strbuf[STR_CASH_SIZE + 1]; /* NUL-terminated cache of the first
                                       STR_CASH_SIZE bytes of the suffix at
                                       suf[aryidx] */
} BLOCK;


/* File handling, index generation, sorting and usage output. */
void open_array_file(char *ary_fname);
size_t open_text_file(char *fname);
void sort_array_file(char *ary_fname, long pointer_cnt);
long make_first_array_file(long);
void usage(void);

/* 1-based binary heap of BLOCK pointers used by divide_and_merge(). */
void insertheap(BLOCK** a, BLOCK* v);
void upheap(BLOCK** a, int k);
void downheap(BLOCK** a, int k);
BLOCK* hremove(BLOCK** a);


/*
 * global variables
 */
BLOCK **bl; /* 990219: heap storage (slots 1..number_of_block), allocated for -b */
int size_of_heap = 0; /* 990219: number of entries currently in the heap */

char *text; /* the target text (mapped by open_text_file) */
long *suf; /* array of pointers (byte offsets) into text */

int option_byline = MODE_OFF; /* set by -l and -w, cleared by -c */
char *delimitter = NULL; /* delimiter characters for line/word indexing;
                            NULL means every character is indexed */
char *progname; /* program name */
int quiet_mode = MODE_OFF; /* quiet mode: do not print progress messages (-q) */
int comment_out_mode = MODE_OFF; /* units beginning with '#' are not indexed (-#) */
int no_sort_mode = MODE_OFF; /* do not sort the array file (-ns) */
int sort_only_mode = MODE_OFF; /* only sort an existing array file (-so) */
int bit_8_mode = MODE_ON; /* treat bytes with the 8th bit set as 2-byte
                             characters; turned off by -8 */
int dict_mode = MODE_OFF; /* -D: in character mode, skip everything after the
                             first tab of each line */
int j_mode = MODE_OFF; /* 981115, set by -J: per the usage text, index only
                          Japanese characters and '<' */
int bunkatu_sort_mode = MODE_OFF; /* 990219: divide-and-merge sort (-b) */
int little_endian_mode = MODE_OFF; /* set in main() from is_little_endian() */

int number_of_block; /* 990219: number of blocks for the divided sort (-b NUM) */

FILE *ofd = NULL; /* output stream of the array file */


/****************************************************************************
 * print a message (using <stdarg.h>)
 ***************************************************************************/
/*
 * Print a printf-style message to stderr when `ok` is non-zero.
 * stdout is flushed first so that the message appears after any output
 * already written there.  Nothing happens when `ok` is zero.
 */
void print_msg(int ok, char *fmt, ...)
{
    if (ok) {
        va_list ap;

        fflush(stdout);
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
    }
}


/****************************************************************************
 * functions for endian problem
 ***************************************************************************/
/*
 * Return 1 when the host stores the least significant byte of an int at
 * the lowest address (little endian), 0 otherwise.
 */
int is_little_endian(void)
{
    int probe = 1;
    char lowest_byte = *(char *)&probe;

    return (lowest_byte == 1) ? 1 : 0;
}

/*
 * Reverse, in place, the byte order of each of the `n` longs starting at `p`.
 *
 * The count is a long (it used to be an int): callers pass the number of
 * suffix array entries, which is kept in a long and could be truncated
 * when squeezed through an int.  A count of zero or less does nothing.
 */
void reverse_byte_order(long *p, long n)
{
    long i;

    for (i = 0; i < n; i++) {
        unsigned char *lo = (unsigned char *)(p + i);
        unsigned char *hi = lo + sizeof(long) - 1;

        /* swap bytes pairwise from both ends towards the middle */
        while (lo < hi) {
            unsigned char tmp = *lo;

            *lo++ = *hi;
            *hi-- = tmp;
        }
    }
}


/****************************************************************************
 * compare suffixies
 ***************************************************************************/
/*
 * Compare the suffixes of `text` that start at offsets *a and *b.
 * Returns the strcmp() result for the two suffixes; used (through a cast)
 * as the qsort() comparison function.
 */
int suffix_compare(long *a, long *b)
{
    const char *left = text + *a;
    const char *right = text + *b;

    return strcmp(left, right);
}

/*
 * Compare the current suffixes of two blocks.
 *
 * First the cached prefixes (strbuf, at most STR_CASH_SIZE bytes) are
 * compared; only when they are equal is the remainder of the suffixes in
 * `text` consulted.
 *
 * Returns a value <0, 0 or >0 like strcmp().
 */
int two_level_compare(BLOCK *a, BLOCK *b)
{
    int r = strcmp(a->strbuf, b->strbuf);

    if (r != 0)
        return r;

    /*
     * Equal caches shorter than STR_CASH_SIZE mean both suffixes ended
     * inside the cache, so the suffixes are identical.  Skipping
     * STR_CASH_SIZE bytes here would compare data lying beyond the
     * terminating NUL.
     */
    if (strlen(a->strbuf) < STR_CASH_SIZE)
        return 0;

    return strcmp(text + suf[a->aryidx] + STR_CASH_SIZE,
                  text + suf[b->aryidx] + STR_CASH_SIZE);
}


/****************************************************************************
 * main
 ***************************************************************************/
/*
 * Program entry point.
 *
 * Parses the command line, maps the text file, writes the initial
 * (unsorted) array file unless -so was given, and sorts the array file
 * unless -ns was given.  Returns 0 on success; every failure path
 * terminates through exit(1) (directly or via usage()).
 */
int main(int argc, char **argv)
{
    /* Delimiter sets selected by -l and -w; only read through `delimitter`. */
    static char line_delimiters[] = "\n";
    static char word_delimiters[] = " \t\n\r\f{}.()~-`'";

    char in_fname[FNLEN];	/* input (text) file name */
    char ary_fname[FNLEN];	/* output (array) file name */
    size_t N;
    long pointer_cnt = 0;
    int len;

    in_fname[0] = '\0';
    ary_fname[0] = '\0';

    progname = argv[0];		/* program name */

    /*================
      option parsing
      ================*/
    if (argc <= 1)
        usage();

    while (argc > 1) {
        if (argv[1][0] == '-') {
            switch (argv[1][1]) {
            case 'o':		/* output file name */
                if (argc == 2) {	/* the file name argument is missing */
                    fprintf(stderr,"-o <filename> --- ϥե̾\n");
                    exit(1);
                }
                len = snprintf(ary_fname, sizeof ary_fname, "%s", argv[2]);
                if (len < 0 || (size_t)len >= sizeof ary_fname) {
                    fprintf(stderr, "ERROR: output file name is too long.\n");
                    exit(1);
                }
                argc--;
                argv++;
                break;
            case 'l':		/* index line by line */
                option_byline = MODE_ON;
                delimitter = line_delimiters;
                break;
            case 'w':		/* index word by word */
                option_byline = MODE_ON;
                delimitter = word_delimiters;
                break;
            case 'c':		/* index character by character (default) */
                option_byline = MODE_OFF;
                delimitter = NULL; /* 991013 */
                break;
            case 'D':		/* dictionary mode */
                dict_mode = MODE_ON;
                break;
            case 'J':		/* Japanese mode */
                j_mode = MODE_ON;
                break;
            case '8':		/* no special handling of 2-byte characters */
                bit_8_mode = MODE_OFF;
                break;
            case 'q':		/* do not print messages */
                quiet_mode = MODE_ON;
                break;
            case 'n':		/* -ns: do not sort */
                if (argv[1][2] == 's')
                    no_sort_mode = MODE_ON;
                break;
            case 's':		/* -so: sort only */
                if (argv[1][2] == 'o')
                    sort_only_mode = MODE_ON;
                break;
            case '#':		/* skip units beginning with '#' */
                comment_out_mode = MODE_ON;
                break;
            case 'b':		/* divide & merge sort 990219 */
                if (argc == 2)	/* the block count argument is missing */
                    usage();
                /* A count below 1 would later divide by zero. */
                if (sscanf(argv[2], "%d", &number_of_block) != 1 ||
                    number_of_block < 1) {
                    fprintf(stderr,
                            "ERROR: -b requires a positive block count.\n");
                    usage();
                }
                free(bl);	/* -b may be given more than once */
                /* slot 0 is unused: the heap is 1-based */
                bl = malloc(((size_t)number_of_block + 1) * sizeof *bl);
                if (bl == NULL) {
                    fprintf(stderr, "ERROR: out of memory.\n");
                    exit(1);
                }
                bunkatu_sort_mode = MODE_ON;
                argc--;
                argv++;
                break;
            default:		/* unknown option */
                fprintf(stderr, "%c: ̵ʥץǤ\n", argv[1][1]);
                usage();
            }
        } else {
            /* text file name */
            len = snprintf(in_fname, sizeof in_fname, "%s", argv[1]);
            if (len < 0 || (size_t)len >= sizeof in_fname) {
                fprintf(stderr, "ERROR: text file name is too long.\n");
                exit(1);
            }
        }
        argc--;
        argv++;
    }

    if (is_little_endian()) {
        print_msg(!quiet_mode, "Byte order is Little Endian.\n");
        little_endian_mode = MODE_ON;
    } else {
        print_msg(!quiet_mode, "Byte order is Big Endian.\n");
        little_endian_mode = MODE_OFF;
    }

    /*** open the text file (mmap) ***/
    N = open_text_file(in_fname);

    /*** default array file name: <text file>.ary ***/
    if (ary_fname[0] == '\0') {
        len = snprintf(ary_fname, sizeof ary_fname, "%s.ary", in_fname);
        if (len < 0 || (size_t)len >= sizeof ary_fname) {
            fprintf(stderr, "ERROR: array file name is too long.\n");
            exit(1);
        }
    }

    /*** write the initial pointers to the array file ***/
    if (sort_only_mode == MODE_ON) {
        /* the array file already exists, so it is only sorted */
        print_msg(!quiet_mode, "Array file exists...\n");
    } else {
        int write_failed;

        open_array_file(ary_fname);
        print_msg(!quiet_mode, "Save to \"%s\"\n", ary_fname);
        print_msg(!quiet_mode, "Reading text file \"%s\"\n", in_fname);
        pointer_cnt = make_first_array_file((long)N);

        /* A short write would otherwise yield a silently truncated index. */
        write_failed = ferror(ofd);
        if (fclose(ofd) != 0 || write_failed) {
            fprintf(stderr, "ERROR: failed to write \"%s\".\n", ary_fname);
            exit(1);
        }
        ofd = NULL;
    }

    /*** sort the pointers in the array file ***/
    if (no_sort_mode == MODE_ON) {
        print_msg(!quiet_mode, "No sort.\n");
    } else {
        print_msg(!quiet_mode, "Sorting...\n");
        sort_array_file(ary_fname, pointer_cnt);
    }

    print_msg(!quiet_mode, "Done.\n");

    return 0;			/* normal termination */
}


/****************************************************************************
 * Open the text file and map it read-only into memory.
 *
 * fname: path of the text file.
 * Returns the size of the file in bytes; the global `text` is set to the
 * start of the mapping.  Any failure prints a message and calls exit(1).
 *
 * NOTE(review): the suffix comparisons run strcmp() on this mapping, so
 * they depend on a NUL byte following the text.  Confirm that holds when
 * the file size is an exact multiple of the page size.
 ***************************************************************************/
size_t open_text_file(char *fname)
{
    struct stat stat_buf;
    void *map;
    int fd;
    size_t N;

    if ((fd = open(fname, O_RDONLY)) < 0) { /* input file */
        fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
        exit(1);
    }

    if (fstat(fd, &stat_buf) < 0) {
        fprintf(stderr, "ERROR: cannot stat text file \"%s\".\n", fname);
        exit(1);
    }
    N = (size_t)stat_buf.st_size;

    map = mmap(NULL, N, PROT_READ, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED) {
        fprintf(stderr,"ERROR: text file mapping error.\n");
        exit(1);
    }
    text = map;

    /* The mapping stays valid after the descriptor is closed. */
    close(fd);

    return N;
}


/****************************************************************************
 * Open the array file for writing and store the stream in the global
 * `ofd`.  Prints a message and calls exit(1) when the file cannot be
 * opened.
 ***************************************************************************/
void open_array_file(char *fname)
{
    ofd = fopen(fname, "w");
    if (ofd != NULL)
        return;

    fprintf(stderr, "ե \"%s\" ץޤ\n", fname);
    exit(1);
}


/****************************************************************************
 * Scan the text and write the offset of every indexing point to the
 * array file (`ofd`) as a native long.
 *
 * size_of_text_file: number of bytes of `text` to scan.
 * Returns the number of offsets written.
 *
 * Character mode (delimitter == NULL):
 *   - dict_mode: on each line only the bytes up to and including the
 *     first tab are candidates; the rest of the line is skipped.
 *   - bit_8_mode: a byte with the 8th bit set that directly follows an
 *     indexed byte with the 8th bit set is taken as the second byte of a
 *     2-byte character and skipped.
 *   - j_mode: everything except '<' and bytes with the 8th bit set is
 *     skipped.
 * Delimiter mode: the first byte after a run of delimiter bytes is
 * indexed, unless comment_out_mode is on and that byte is '#'.
 *
 * A '+' is printed every 50000 bytes and a megabyte count every 1000000
 * bytes unless quiet_mode is on.
 ****************************************************************************/
long make_first_array_file(long size_of_text_file)
{
    long i;
    long num_of_indexies_in_array_file = 0;
    int pre_char_is_delimitter = 1;
    int pre_char_is_8th_bit_on = 0;
    int in_dict_entry_label = 1;

    for (i = 0; i < size_of_text_file; i++) {

        if (delimitter == NULL) {
            /* index character by character */

            if (dict_mode == MODE_ON) {
                if (in_dict_entry_label) {
                    /* the tab itself still belongs to the label */
                    if (text[i] == '\t')
                        in_dict_entry_label = 0;
                } else {
                    /* skipped bytes also skip the progress output below */
                    if (text[i] == '\n')
                        in_dict_entry_label = 1;
                    continue;
                }
            }

            if (bit_8_mode == MODE_ON && EIGHTH_BIT_ON(text[i]) &&
                pre_char_is_8th_bit_on == 1) {
                /* second byte of a 2-byte character: not indexed */
                pre_char_is_8th_bit_on = 0;

            } else if (j_mode == MODE_ON && text[i] != '<' &&
                       !EIGHTH_BIT_ON(text[i])) {
                /*
                 * j_mode: ignore everything except Japanese (8-bit)
                 * characters and '<'.  The 8th-bit test used to be
                 * un-negated, which made the '<' test redundant and
                 * skipped exactly the characters -J is meant to index.
                 */
            } else {
                fwrite(&i, 1, sizeof(long), ofd);
                num_of_indexies_in_array_file++;
                if (EIGHTH_BIT_ON(text[i]))
                    pre_char_is_8th_bit_on = 1;
            }

        } else if (strchr(delimitter, text[i]) != NULL && text[i] != '\0') {
            /* strchr() also matches the terminating NUL, hence the
               explicit '\0' test */
            pre_char_is_delimitter = 1;

        } else if (pre_char_is_delimitter == 1) {
            /* first byte after a delimiter */
            if (!(comment_out_mode && text[i] == '#')) {
                fwrite(&i, 1, sizeof(long), ofd);
                num_of_indexies_in_array_file++;
            }
            pre_char_is_delimitter = 0;
        }

        if (quiet_mode == MODE_ON || i == 0)
            continue;
        if (!(i % 50000))
            print_msg(!quiet_mode, "+");
        if (!(i % 1000000))
            print_msg(!quiet_mode, " %ldM\n", i/1000000);
    }

    print_msg(!quiet_mode, "\n");

    return num_of_indexies_in_array_file;
}


/****************************************************************************
 * sort a huge file by "divide and merge" (called from sort_array_file() )
 ***************************************************************************/
void divide_and_merge(long* suf, char *merged_ary_fname, long pointer_cnt)
{
    int i;
    long bl_size = pointer_cnt / number_of_block;
    FILE* mfd;

    if ((mfd = fopen(merged_ary_fname,"w")) == NULL) {
	fprintf(stderr,"ե \"%s\" ץޤ\n",
		merged_ary_fname);
	exit(1);
    }


    for (i = 1; i <= number_of_block; i++) {
	BLOCK* tmp = (BLOCK*)malloc(sizeof(BLOCK));
	long size;

	print_msg(!quiet_mode, " BLOCK %d\n", i);

	tmp->aryidx = (i - 1) * bl_size;

	if (i == number_of_block) {
	    size = pointer_cnt - bl_size * (i - 1);
	} else {
	    size = bl_size;
	}

	tmp->max = tmp->aryidx + size;
	/*printf("from=%ld to=%ld size=%ld\n", tmp->aryidx, tmp->max, size);*/
	qsort(suf + tmp->aryidx, (size_t)size, sizeof(long),
	      (int (*)(const void *,const void *))suffix_compare);

	strncpy(tmp->strbuf, text + suf[tmp->aryidx], STR_CASH_SIZE);
	tmp->strbuf[STR_CASH_SIZE] = '\0';

	insertheap(bl, tmp);
    }

    /*
       for(i = 1; i <= number_of_block; i++)
       printf("heap[%d] %ld %ld \"%x\"\n",
       i, bl[i]->aryidx, bl[i]->max, bl[i]->strbuf[0]);
       */
	
    
    print_msg(!quiet_mode, "Merging into \"%s\"\n", merged_ary_fname);
    
    for (i = 0; i < pointer_cnt; i++) {
	BLOCK* tmp = bl[1];
	long idx_to_output;
	idx_to_output = *(suf + tmp->aryidx);

	if (little_endian_mode) {
	    reverse_byte_order(&idx_to_output, 1);
	}
	fwrite(&idx_to_output, 1, sizeof(long), mfd);

	(tmp->aryidx)++;
	if (tmp->aryidx >= tmp->max) {
	    hremove(bl);
	    /*printf("remove %ld\n", tmp->max);*/
	    continue;
	}

	strncpy(tmp->strbuf, text + suf[tmp->aryidx], STR_CASH_SIZE);
	tmp->strbuf[STR_CASH_SIZE] = '\0';

	downheap(bl, 1);
    }
    
    fclose(mfd);
}


/****************************************************************************
 * Sort the pointers stored in the array file.
 *
 * ary_fname:   path of the array file; it is mapped read/write and the
 *              global `suf` is set to the mapping.
 * pointer_cnt: number of entries in the file; in sort_only_mode it is
 *              recomputed from the file size.
 *
 * Without -b the mapping is sorted in place with qsort() and byte-swapped
 * when little_endian_mode is on.  With -b the sorted result is written to
 * "<ary_fname>-" by divide_and_merge() and then renamed over the array
 * file.  Any failure prints a message and calls exit(1).
 ***************************************************************************/
void sort_array_file(char *ary_fname, long pointer_cnt)
{
    struct stat om_stat_buf;
    void *outmap;
    int omfd;
    size_t omsize;

    if ((omfd = open(ary_fname, O_RDWR)) < 0) { /* array file */
        fprintf(stderr,"ե \"%s\" ץޤ\n", ary_fname);
        exit(1);
    }

    if (fstat(omfd, &om_stat_buf) < 0) {
        fprintf(stderr, "ERROR: cannot stat array file \"%s\".\n", ary_fname);
        exit(1);
    }
    omsize = (size_t)om_stat_buf.st_size;

    if (sort_only_mode == MODE_ON)
        pointer_cnt = omsize / sizeof(long);

    outmap = mmap(NULL, omsize, PROT_READ | PROT_WRITE, MAP_SHARED, omfd, 0);
    if (outmap == MAP_FAILED) {
        fprintf(stderr,"ERROR: array file mapping error.\n");
        exit(1);
    }

    suf = (long *)outmap;

    if (bunkatu_sort_mode == MODE_OFF) {
        /* ordinary in-place sort */
        qsort(suf, (size_t)pointer_cnt, sizeof(long),
              (int (*)(const void *, const void *))suffix_compare);

        if (little_endian_mode) {
            reverse_byte_order(suf, pointer_cnt);
        }

        close(omfd);

    } else {
        /* sort by divide and merge; room for the name, '-' and the NUL */
        size_t merged_len = strlen(ary_fname) + 2;
        char merged_ary_fname[merged_len];

        snprintf(merged_ary_fname, merged_len, "%s-", ary_fname);

        divide_and_merge(suf, merged_ary_fname, pointer_cnt);

        close(omfd);

        print_msg(!quiet_mode, "Rename \"%s\" with \"%s\"\n",
                  merged_ary_fname, ary_fname);
        /* A failed rename would leave the unsorted array file in place. */
        if (rename(merged_ary_fname, ary_fname) != 0) {
            fprintf(stderr, "ERROR: cannot rename \"%s\" to \"%s\".\n",
                    merged_ary_fname, ary_fname);
            exit(1);
        }
    }
}


/****************************************************************************
 * print "usage"
 ***************************************************************************/
/*
 * Write the option summary to stderr and terminate with exit status 1;
 * this function never returns.
 * NOTE(review): the summary lists a -m option, but the option switch in
 * main() has no 'm' case, and it omits -D, which main() does accept.
 */
void usage(void){
    fprintf(stderr, "\
mkary - make index file of text file v1.8\n\n\
Usage:\n\
  mkary [ -l [-#] ] [ -w ] [ -c ] [ -q ] [ -ns ] [ -so ] [ -8 ]\n\
        [ -J ] [ -m ] [ -b NUM ] [ -o file_name ] text_file_name\n\n\
Options:\n\
   -o file_name  : ϥե( ɸ text_file_name.ary )\n\
   -l            : ñ̤Ǻ ( \\nǶڤ )\n\
   -w            : ññ̤Ǻ ( ' ',\\t,\\n ʤɤǶڤ )\n\
   -c            : ʸñ̤Ǻ ( ɸ )\n\
   -q            : åʤ\n\
   -ns           : Ȥʤ(No Sort)\n\
   -so           : Ȥ(Sort Only)\n\
   -8            : 2ХȰʸԤʤʤ\n\
   -J            : ܸʸ '<' ʳ̵뤹(ʸñ̤ΤȤ)\n\
   -#            : #ǻϤޤԤϥȥ(ñ̤ΤȤ)\n\
   -b NUM        : ʬ䤷ƥȤԤǸ˥ޡ\n\
                   NUM ʬꡣ­ΤȤˤɬ\n\n"
  );
    /* usage errors always end the program with a failure status */
    exit(1);
}


/***************************************************************************
 * Heap routines 990219
 * Reference: R. Sedgewick, "Algorithms", 2nd edition, volume 1
 * (Japanese translation)
 ***************************************************************************/
/*
 * Move the entry at position k of the 1-based heap `a` towards the root
 * while its parent's current suffix compares greater than its own.
 */
void upheap(BLOCK** a, int k)
{
    BLOCK *rising = a[k];
    int parent;

    for (parent = k / 2; k > 1; parent = k / 2) {
        if (suffix_compare(suf + a[parent]->aryidx,
                           suf + rising->aryidx) <= 0)
            break;
        a[k] = a[parent];	/* pull the larger parent down */
        k = parent;
    }
    a[k] = rising;
}

/*
 * Append `v` to the heap `a`, growing size_of_heap by one, and restore
 * the heap order with upheap().
 */
void insertheap(BLOCK** a, BLOCK* v)
{
    int slot = ++size_of_heap;

    a[slot] = v;
    upheap(a, slot);
}

/*
 * Move the entry at position k of the 1-based heap `a` towards the leaves
 * until neither child compares smaller (two_level_compare) than it.
 */
void downheap(BLOCK** a, int k)
{
    BLOCK *sinking = a[k];

    for (;;) {
        int child;

        if (k > size_of_heap / 2)
            break;		/* position k has no children */

        child = 2 * k;
        /* prefer the right child when the left one is not smaller */
        if (child < size_of_heap &&
            two_level_compare(a[child], a[child + 1]) >= 0)
            child++;

        if (two_level_compare(sinking, a[child]) <= 0)
            break;

        a[k] = a[child];
        k = child;
    }
    a[k] = sinking;
}

/*
 * Remove the root of the heap `a` and return it.  The last entry takes
 * the root's place, size_of_heap shrinks by one and the heap order is
 * restored with downheap().
 */
BLOCK* hremove(BLOCK** a)
{
    BLOCK *removed = a[1];

    a[1] = a[size_of_heap--];
    downheap(a, 1);

    return removed;
}
