/*
 * Suffix array construction program
 * Reference: Kenneth W. Church, NLPRS '95 Invited Lecture
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>

#define FNLEN 1000 /* size of the file name buffers */
#define MODE_ON 1
#define MODE_OFF 0
#define STR_CASH_SIZE 20 /* suffix characters cached per block (990220) */


/*
 * One sorted run of the suffix array, used as a heap element by the
 * split-and-merge sort (990219).
 *
 *   aryidx : index into suf[] of the run's current (smallest remaining) entry
 *   max    : index one past the last entry of the run
 *   size   : number of entries in the run (not read by the heap routines)
 *   strbuf : cache of the first STR_CASH_SIZE characters of the current
 *            suffix.  The merge code stores a terminator at index
 *            STR_CASH_SIZE, so the buffer holds STR_CASH_SIZE + 1 bytes.
 */
typedef struct {
  long aryidx;
  long max;
  long size;
  char strbuf[STR_CASH_SIZE + 1];
} BLOCK;

/* File set-up and index construction. */
size_t open_text_file(char *fname);
void open_array_file(char *ary_fname);
long make_first_array_file(long N);
void sort_array_file(char *ary_fname, long pointer_cnt);
void usage(void);

/* Binary min-heap of sorted blocks for the split-and-merge sort (990219). */
void insertheap(BLOCK **a, BLOCK *v);
void upheap(BLOCK **a, int k);
void downheap(BLOCK **a, int k);
BLOCK *hremove(BLOCK **a);

/*==================
   Global variables
==================*/
BLOCK **bl;               /* heap of sorted blocks; slot 1 is the root (990219) */
int size_of_heap = 0;     /* number of blocks currently in the heap (990219) */

char *text;               /* the memory-mapped text being indexed */
long *suf;                /* the memory-mapped array of text offsets */

int option_byline = MODE_OFF;      /* index only positions that follow a delimiter */
char delimitter[20];               /* delimiter characters for line/word indexing */
char *progname;                    /* program name (argv[0]) */
int quiet_mode = MODE_OFF;         /* suppress progress messages */
int comment_out_mode = MODE_OFF;   /* do not index positions that start with '#' */
int no_sort_mode = MODE_OFF;       /* write the array file but do not sort it */
int sort_only_mode = MODE_OFF;     /* sort an existing array file only */
int bit_8_mode = MODE_ON;          /* treat high-bit byte pairs as one character */
int dict_mode = MODE_OFF;          /* dictionary mode (-D) */
int j_mode = MODE_OFF;             /* index only high-bit bytes and '<' (981115) */
int bunkatu_sort_mode = MODE_OFF;  /* split-and-merge sort (990219) */

int number_of_block;      /* number of blocks for the split sort (990219) */

FILE *ofd = NULL;         /* stream the initial array file is written to */

/* Debug counters for the heap comparisons in two_level_compare(). */
int ctr_1 = 0;
int ctr_2 = 0;


/* Exchange elements i and j of the array v (helper for rqsort, 990616). */
void swap(long *v, size_t i, size_t j){
  long held = v[j];
  v[j] = v[i];
  v[i] = held;
}

/*
 * Compare the suffixes of the global text that start at offsets a and b,
 * skipping the first `depth` characters of each (helper for rqsort, 990616).
 * Returns the strcmp() result for the two remaining strings.
 */
int sufcmp(long a, long b, long depth){
  const char *lhs = text + a + depth;
  const char *rhs = text + b + depth;
  return strcmp(lhs, rhs);
}

/*
 * Randomised quicksort of n text offsets (990616).
 *
 *   v     : array of offsets into the global text
 *   n     : number of elements in v
 *   depth : number of leading characters skipped by every comparison
 *
 * A random element is chosen as pivot, the elements that compare smaller
 * are moved in front of it, and both partitions are sorted in turn.  The
 * indices are size_t so that arrays larger than INT_MAX elements work, and
 * only the smaller partition is handled recursively (the larger one is
 * handled by the loop), which bounds the recursion depth by log2(n).
 */
void rqsort(long *v, size_t n, long depth)
{
  while(n > 1){
    size_t i;
    size_t last = 0;
    size_t right;

    swap(v, 0, (size_t)rand() % n);
    for(i = 1; i < n; i++){
      if(sufcmp(v[i], v[0], depth) < 0) swap(v, ++last, i);
    }
    swap(v, 0, last);

    /* Now v[0..last-1] < v[last] <= v[last+1..n-1]. */
    right = n - last - 1;
    if(last < right){
      rqsort(v, last, depth);
      v += last + 1;
      n = right;
    } else {
      rqsort(v + last + 1, right, depth);
      n = last;
    }
  }
}



/*
 * qsort()-style comparison of two suffix array entries.  a and b point at
 * offsets into the global text; the suffixes starting there are compared
 * with strcmp().
 */
int suffix_compare(long *a, long *b)
{
  const char *left = text + *a;
  const char *right = text + *b;
  return strcmp(left, right);
}


/*
 * Compare the current suffixes of two blocks (since 990220).
 *
 * The cached prefixes in strbuf are compared first; only when they are
 * equal is the text itself consulted, starting STR_CASH_SIZE characters
 * into each suffix.  ctr_1 counts comparisons whose cached prefixes were
 * equal, ctr_2 counts those decided by the cache alone.
 *
 * If the equal cached prefixes contain a terminator within their first
 * STR_CASH_SIZE bytes, both suffixes end inside the cache and are equal;
 * reading the text at offset STR_CASH_SIZE would then run past the end of
 * both strings, so 0 is returned instead.
 */
int two_level_compare(BLOCK *a, BLOCK *b) /* since 990220 */
{
  int r = strcmp(a->strbuf, b->strbuf);
  if(r != 0){
    ctr_2++;
    return r;
  }
  ctr_1++;
  if(memchr(a->strbuf, '\0', STR_CASH_SIZE) != NULL) return 0;
  return strcmp(text + suf[a->aryidx] + STR_CASH_SIZE,
		text + suf[b->aryidx] + STR_CASH_SIZE);
}


/*
 * Program entry point: parse the command line, map the text file, write
 * the initial array file and sort it.
 *
 * Options are described by usage().  A non-option argument is taken as
 * the name of the text file; when several are given the last one wins.
 * Without -o the array file is named "<text file>.ary".
 *
 * Exits with status 1 on any usage or I/O error, 0 on success.
 */
int main(int argc, char **argv)
{
  char in_fname[FNLEN];  /* name of the text file to index */
  char ary_fname[FNLEN]; /* name of the array (index) file */
  size_t N;
  long pointer_cnt = 0;
  int len;

  in_fname[0] = '\0';
  ary_fname[0] = '\0';

  progname = argv[0]; /* program name */

  /*================
    Option parsing
  ================*/
  if(argc <= 1){
    usage();
    exit(1);
  }
  while (argc > 1){
    if (argv[1][0] == '-'){
      switch (argv[1][1]){
      case 'o': /* name of the output array file */
        if (argc == 2){ /* the file name argument is missing */
          fprintf(stderr,"-o <filename> --- ϥե̾\n");
          exit(1);
        }
        /* Bounded copy: a long argument must not overflow ary_fname. */
        len = snprintf(ary_fname, sizeof ary_fname, "%s", argv[2]);
        if (len < 0 || (size_t)len >= sizeof ary_fname){
          fprintf(stderr, "%s: file name too long\n", argv[2]);
          exit(1);
        }
        argc--; argv++;
        break;
      case 'l': /* index the start of every line */
        option_byline = MODE_ON;
        strcpy(delimitter,"\n");
        break;
      case 'w': /* index the start of every word */
        option_byline = MODE_ON;
        strcpy(delimitter," \t\n\r\f{}.()~-`'");
        break;
      case 'c': /* index every character (default) */
        option_byline = MODE_OFF;
        break;
      case 'D': /* dictionary mode */
        dict_mode = MODE_ON;
        break;
      case 'J': /* index only high-bit characters and '<' */
        j_mode = MODE_ON;
        break;
      case '8': /* do not treat high-bit byte pairs as one character */
        bit_8_mode = MODE_OFF;
        break;
      case 'q': /* no progress messages */
        quiet_mode = MODE_ON;
        break;
      case 'n': /* -ns: do not sort */
        if(argv[1][2] == 's') no_sort_mode = MODE_ON;
        break;
      case 's': /* -so: sort only */
        if(argv[1][2] == 'o') sort_only_mode = MODE_ON;
        break;
      case '#': /* skip positions that start with '#' */
        comment_out_mode = MODE_ON;
        break;
      case 'b': /* split sort and merge (990219) */
        if (argc == 2){ /* the block count argument is missing */
          usage(); exit(1);
        }
        /* The block count is used as a divisor and as an allocation
           size later on, so it must parse and be at least 1. */
        if (sscanf(argv[2], "%d", &number_of_block) != 1
            || number_of_block < 1){
          fprintf(stderr, "-b %s: NUM must be a positive integer\n", argv[2]);
          usage();
          exit(1);
        }
        free(bl); /* a repeated -b replaces the earlier allocation */
        bl = malloc(((size_t)number_of_block + 1) * sizeof *bl);
        if (bl == NULL){
          fprintf(stderr, "ERROR: out of memory for %d blocks\n",
                  number_of_block);
          exit(1);
        }
        bunkatu_sort_mode = MODE_ON;
        argc--; argv++;
        break;
      default : /* unknown option */
        fprintf(stderr, "%c: ̵ʥץǤ\n", argv[1][1]);
        usage();
        exit(1);
      }
    } else {
      /* name of the text file */
      len = snprintf(in_fname, sizeof in_fname, "%s", argv[1]);
      if (len < 0 || (size_t)len >= sizeof in_fname){
        fprintf(stderr, "%s: file name too long\n", argv[1]);
        exit(1);
      }
    }
    argc--; argv++;
  }


  /*** open (mmap) the text file ***/
  N = open_text_file(in_fname);

  /*** open the array file ***/
  if (ary_fname[0] == '\0'){
    len = snprintf(ary_fname, sizeof ary_fname, "%s.ary", in_fname);
    if (len < 0 || (size_t)len >= sizeof ary_fname){
      fprintf(stderr, "%s.ary: file name too long\n", in_fname);
      exit(1);
    }
  }
  if(sort_only_mode != MODE_ON) open_array_file(ary_fname);

  /*** write the unsorted pointers to the array file ***/
  if(sort_only_mode == MODE_ON){
    /* the array file already exists and is only sorted */
    if(quiet_mode == MODE_OFF) fprintf(stderr,"Array file exists...\n");
  } else {
    int write_failed;

    if(quiet_mode == MODE_OFF)
      fprintf(stderr, "Reading text file \"%s\"\n", in_fname);
    pointer_cnt = make_first_array_file((long)N);

    /* The stream error flag is sticky, so checking it here together with
       the result of fclose() catches any failed write of the index. */
    write_failed = ferror(ofd);
    if (fclose(ofd) != 0) write_failed = 1;
    if (write_failed){
      fprintf(stderr, "ERROR: cannot write array file \"%s\"\n", ary_fname);
      exit(1);
    }
  }

  /*** sort the pointers in the array file ***/
  if(no_sort_mode == MODE_ON){ /* no-sort mode */
    if(quiet_mode == MODE_OFF) fprintf(stderr,"No sort.\n");
  } else sort_array_file(ary_fname, pointer_cnt);

  if(quiet_mode == MODE_OFF) fprintf (stderr,"Done.\n");

  return 0; /* normal termination */
}


/******************************************************************************
 *   size_t open_text_file(char *fname);
 *
 * purpose
 *   Open the text file read-only and map it into memory.
 *
 * parameters
 *   fname : name of the text file
 *
 * return value
 *   size of the text file in bytes
 *
 * description
 *   The mapping is stored in the global `text`.  Any failure (open, fstat,
 *   mmap) prints a message to stderr and exits with status 1.  The file
 *   descriptor is closed once the mapping exists; the mapping stays valid.
 *
 *   NOTE(review): the comparison routines run strcmp() over this mapping
 *   and therefore depend on a NUL byte following the text; nothing here
 *   guarantees one when the file size is an exact multiple of the page
 *   size -- confirm how such inputs are expected to be handled.
 *****************************************************************************/
size_t open_text_file(char *fname)
{
  struct stat stat_buf;
  int fd;
  size_t N;
  void *map;

  if ((fd = open(fname, O_RDONLY)) < 0){ /* input file */
    fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
    exit(1);
  }

  if (fstat(fd, &stat_buf) < 0){
    fprintf(stderr,"ERROR: cannot stat text file \"%s\".\n", fname);
    exit(1);
  }
  N = (size_t)stat_buf.st_size;

  map = mmap(NULL, N, PROT_READ, MAP_SHARED, fd, 0);
  if (map == MAP_FAILED){
    fprintf(stderr,"ERROR: text file mapping error.\n");
    exit(1);
  }
  text = map;

  (void)close(fd); /* the mapping does not need the descriptor any more */
  return N;
}


/******************************************************************************
  Open the array file for writing.

  The stream is stored in the global `ofd`.  If the file cannot be opened a
  message is printed to stderr and the program exits with status 1.  Unless
  quiet mode is on, the name of the file is reported on stderr.
 *****************************************************************************/
void open_array_file(char *fname)
{
  ofd = fopen(fname,"w");
  if(ofd == NULL){
    fprintf(stderr,"ե \"%s\" ץޤ\n", fname);
    exit(1);
  }
  if(quiet_mode != MODE_OFF) return;
  fprintf(stderr,"Save to \"%s\"\n",fname);
}


/******************************************************************************
 *   long make_first_array_file(long N);
 *
 * purpose
 *   Scan the mapped text and write the offset of every position that is to
 *   be indexed to the array file (the stream `ofd`).
 *
 * parameters
 *   N : size of the text in bytes
 *
 * return value
 *   number of offsets written to the array file
 *
 * description
 *   Which positions are indexed is decided by the global mode variables:
 *     - dict_mode     : only positions from the start of a line through its
 *                       first TAB are indexed; the rest of the line is
 *                       skipped up to the newline.
 *     - option_byline : only the first character after a run of characters
 *                       from `delimitter` (and the first character of the
 *                       text) is indexed; with comment_out_mode such a
 *                       position is skipped when it holds '#'.
 *     - otherwise     : every character is indexed; with j_mode only '<'
 *                       and bytes with the high bit set are indexed.
 *   With bit_8_mode on, a high-bit byte that follows an indexed high-bit
 *   byte is treated as the second half of a two-byte character and is not
 *   indexed.
 *
 *   Each offset is written as a native `long`.  The results of fwrite()
 *   are not checked here.  Unless quiet_mode is on, a '+' is printed to
 *   stderr every 50000 bytes and a megabyte count every 1000000 bytes.
 *
 *   NOTE(review): last_char_is_kanji is only cleared when bit_8_mode is on.
 *   In dict_mode with bit_8_mode off it stays 1 after the first high-bit
 *   byte, so the TAB/newline tests no longer fire -- confirm whether that
 *   combination is meant to be supported.
 *****************************************************************************/
long make_first_array_file(long N)
{
  long i, jj = 0;
  int last_char_is_delimitter = 1; /* the start of the text counts as a boundary */
  int last_char_is_kanji = 0;      /* previous byte was an indexed high-bit byte */
  int dic_ent = 1;                 /* dict_mode: inside the part before the TAB */

  if (dict_mode == MODE_ON) {
      for(i = 0; i < N; i++){
	  if (dic_ent) {
	      /* The TAB itself is still indexed below; indexing stops after it. */
	      if (!last_char_is_kanji && text[i] == '\t') dic_ent = 0;
	  } else {
	      /* Outside the entry nothing is indexed; a newline starts the
	         next entry. */
	      if (!last_char_is_kanji && text[i] == '\n') {
		  dic_ent = 1;
		  continue;
	      }
	      else continue;
	  }
	  /* Treat a pair of high-bit bytes as one two-byte character. */
	  if(bit_8_mode == MODE_ON
	     && last_char_is_kanji == 1
	     && (0x80 & text[i])) {
	      /* second byte of the pair: not indexed */
	      last_char_is_kanji = 0;
	  } else {
	      fwrite(&i, 1, sizeof(long), ofd);
	      jj++;
	      if((0x80 & text[i])) last_char_is_kanji = 1;
	  }
	  /* progress report */
	  if(quiet_mode == MODE_ON || i == 0) continue;
	  if(!(i % 50000)) fprintf(stderr,"+");
	  if(!(i % 1000000)) fprintf(stderr," %ldM\n",i/1000000);
      }
  } else if(option_byline == MODE_ON){ /* index by line or by word */
    for(i = 0; i < N; i++){
      /* strchr() also matches the terminator of `delimitter`, so a NUL
         byte in the text is explicitly excluded from the delimiters. */
      if((char*)strchr(delimitter, text[i]) != NULL && text[i] != '\0')
	last_char_is_delimitter = 1;
      else if(last_char_is_delimitter == 1){ /* first character after a delimiter */
	if(comment_out_mode && text[i] == '#'){
	  /* positions that start with '#' are not indexed (980319) */
	} else {
	  fwrite(&i, 1, sizeof(long), ofd);
	  jj++;
	}
	last_char_is_delimitter = 0;
      }
      /* progress report */
      if(quiet_mode == MODE_ON || i == 0) continue;
      if(!(i % 50000)) fprintf(stderr,"+");
      if(!(i % 1000000)) fprintf(stderr," %ldM\n",i/1000000);
    }
  } else {                    /* index every character */
    for(i = 0; i < N; i++){
      /* Treat a pair of high-bit bytes as one two-byte character. */
      if(bit_8_mode == MODE_ON
	 && (0x80 & text[i]) != 0x00
	 && last_char_is_kanji == 1){
	/* second byte of the pair: not indexed */
	last_char_is_kanji = 0;
      } else {
	if(j_mode == MODE_ON && text[i] != '<' && ((0x80 & text[i]) == 0x00)){
	  /* j_mode: 7-bit characters other than '<' are not indexed */
	} else {
	  fwrite(&i, 1, sizeof(long), ofd);
	  jj++;
	  if((0x80 & text[i]) != 0x00) last_char_is_kanji = 1;
	}
      }
      /* progress report */
      if(quiet_mode == MODE_ON || i == 0) continue;
      if(!(i % 50000)) fprintf(stderr,"+");
      if(!(i % 1000000)) fprintf(stderr," %ldM\n",i/1000000);
    }
  }

  if(quiet_mode == MODE_OFF) fprintf(stderr,"\n");
  return(jj);
}


/******************************************************************************
  ݥ󥿤򥽡
 *****************************************************************************/
void sort_array_file(char *ary_fname, long pointer_cnt)
{
  struct stat om_stat_buf;
  char *outmap;
  int omfd;
  size_t omsize;

  if ((omfd = open(ary_fname, O_RDWR)) < 0){  /* ϥե */
    fprintf(stderr,"ե \"%s\" ץޤ\n", ary_fname);
    exit(1);
  }

  (void)fstat(omfd, &om_stat_buf);
  omsize = (size_t)om_stat_buf.st_size;
  if(sort_only_mode == MODE_ON) pointer_cnt = omsize / sizeof(long);
  /* omsize = pointer_cnt * sizeof(int); ǤOK */
  /* printf("%ld %ld\n",pointer_cnt*sizeof(long),omsize);*/
  if((outmap = mmap((caddr_t)0, omsize, PROT_READ | PROT_WRITE, MAP_SHARED, omfd, 0)) == (caddr_t)-1){
    fprintf(stderr,"ERROR: array file mapping error.\n");
    exit(1);
  }
  suf = (long *)outmap;

  if(quiet_mode == MODE_OFF) fprintf (stderr,"Sorting...\n");

  if(bunkatu_sort_mode == MODE_OFF){ /* ̤˥ */
    /*qsort(suf, (size_t)pointer_cnt, sizeof(long),
	  (int (*)(const void *,const void *))suffix_compare);
	  */
    rqsort((void*)suf, (size_t)pointer_cnt, 0);

    if(quiet_mode == MODE_OFF) fprintf (stderr,"Saving...\n");
    close(omfd);
  } else { /* ʬޡ: bunkatu_sort_mode == MODE_ON 990219 */
    int i;
    long bl_size = pointer_cnt / number_of_block;
    FILE* mfd;
    char merged_ary_fname[strlen(ary_fname)+2];
    sprintf(merged_ary_fname, "%s-", ary_fname);

    /* if(pointer_cnt % number_of_block) bl_size++;*/
    /*printf("bl_size = %ld, pointer_cnt = %ld\n", bl_size, pointer_cnt);*/
    for(i = 1; i <= number_of_block; i++){
      BLOCK* tmp = (BLOCK*)malloc(sizeof(BLOCK));
      long size;
 
      if(quiet_mode == MODE_OFF) fprintf(stderr, " BLOCK %d\n", i);

      tmp->aryidx = (i-1)*bl_size;
      if(i == number_of_block) size = pointer_cnt - bl_size * (i-1);
      else size = bl_size;
      tmp->max = tmp->aryidx + size;
      /*printf("from=%ld to=%ld size=%ld\n", tmp->aryidx, tmp->max, size);*/
      qsort(suf+tmp->aryidx, (size_t)size, sizeof(long),
	    (int (*)(const void *,const void *))suffix_compare);
      strncpy(tmp->strbuf, text+suf[tmp->aryidx], STR_CASH_SIZE);
      tmp->strbuf[STR_CASH_SIZE] = '\0';
      insertheap(bl, tmp);
    }

    /*
    for(i = 1; i <= number_of_block; i++)
      printf("heap[%d] %ld %ld \"%x\"\n",
	     i, bl[i]->aryidx, bl[i]->max, bl[i]->strbuf[0]);
    */
    if((mfd = fopen(merged_ary_fname,"w")) == NULL){
      fprintf(stderr,"ե \"%s\" ץޤ\n",
	      merged_ary_fname);
      exit(1);
    }

    if(quiet_mode == MODE_OFF)
      fprintf(stderr, "Merging into \"%s\"\n", merged_ary_fname);
    for(i = 0; i < pointer_cnt; i++){
      BLOCK* tmp = bl[1];
      /*printf("tmp->aryidx=%ld tmp->strbuf='%s'\n", tmp->aryidx, tmp->strbuf);
      printf("suf[tmp->aryidx] = %ld\n", suf[tmp->aryidx]);*/
      fwrite(suf+tmp->aryidx, 1, sizeof(long), mfd);
      (tmp->aryidx)++;
      if(tmp->aryidx >= tmp->max){
	hremove(bl);
	/*printf("remove %ld\n", tmp->max);*/
	continue;
      }
      strncpy(tmp->strbuf, text+suf[tmp->aryidx], STR_CASH_SIZE);
      tmp->strbuf[STR_CASH_SIZE] = '\0';
      downheap(bl, 1);
    }

    close(omfd);
    fclose(mfd);
    if(quiet_mode == MODE_OFF)
      fprintf(stderr,"Rename \"%s\" with \"%s\"\n",merged_ary_fname,ary_fname);
    rename(merged_ary_fname, ary_fname);

  }
}


/*
   usage --- print the command line summary to stderr.

   The synopsis lists the options that main() accepts: the -m entry, which
   main() rejects as an unknown option, is replaced by -D, which main()
   accepts.
*/
void usage(void){
  fprintf(stderr, "\n"
	  "mkary --- array ե\n\n"
	  "Version 1.7 990616\n\n"
	  "USAGE\n"
	  "  mkary [ -l [-#] ] [ -w ] [ -c ] [ -q ] [ -ns ] [ -so ] [ -8 ]\n"
	  "        [ -J ] [ -D ] [ -b NUM ] [ -o FILE_NAME ] FILE_NAME\n"
	  "\n"
	  "OPTION\n"
	  "  -o FILE_NAME  : ϥե( default  FILE_NAME.ary )\n"
	  "  -l            : ñ̤Ǻ ( \"\\n\" Ƕڤ )\n"
	  "  -w            : ññ̤Ǻ ( \" \",\"\\t\",\"\\n\" Ƕڤ )\n"
	  "  -c            : ʸñ̤Ǻ ( default )\n"
	  "  -q            : åʤ\n"
	  "  -ns           : Ȥʤ(No Sort)\n"
	  "  -so           : Ȥ(Sort Only)\n"
	  "  -8            : 2ХȰʸԤʤʤ\n"
	  "  -J            : ܸʸ '<' ʳ̵뤹(ʸñ̤ΤȤ)\n"
          "  -D            : dictionary mode (index each line only up to its first TAB)\n"
          "  -#            : #ǻϤޤԤϥȥ(ñ̤ΤȤ)\n"
          "  -b NUM        : ʬ䤷ƥȤԤǸ˥ޡ\n"
          "                  NUM ʬꡣ­ΤȤˤɬ\n"
	  "\n"
	  );
}


/******************************************************************************
  Heap routines 990219
  Reference: R. Sedgewick, "Algorithms", 2nd ed., vol. 1 (Japanese
             translation)

  The heap is stored in slots 1..size_of_heap of the array and keeps the
  block with the smallest current suffix at slot 1.
 *****************************************************************************/

/*
 * Move the block at slot k towards the root while the current suffix of
 * its parent compares greater than its own.
 */
void upheap(BLOCK** a, int k){
  BLOCK *moving = a[k];

  for(;;){
    int parent = k / 2;
    if(k <= 1) break;
    if(suffix_compare(suf + a[parent]->aryidx, suf + moving->aryidx) <= 0) break;
    a[k] = a[parent];
    k = parent;
  }
  a[k] = moving;
}

/*
 * Add block v to the heap: place it in the first free slot, grow the heap
 * by one and let it rise to its position.  The array must have room for
 * the new slot.
 */
void insertheap(BLOCK** a, BLOCK* v){
  int slot = size_of_heap + 1;

  a[slot] = v;
  size_of_heap = slot;
  upheap(a, slot);
}

/*
 * Sink the block at slot k: while it has children, pick the child whose
 * current suffix is not greater than its sibling's (the right child on a
 * tie) and move that child up unless the sinking block already compares
 * less than or equal to it.  Comparisons use two_level_compare().
 */
void downheap(BLOCK** a, int k){
  BLOCK *moving = a[k];
  const int last_parent = size_of_heap / 2;

  while(k <= last_parent){
    int child = k * 2;

    if(child < size_of_heap && two_level_compare(a[child], a[child + 1]) >= 0)
      child++;
    if(two_level_compare(moving, a[child]) <= 0) break;
    a[k] = a[child];
    k = child;
  }
  a[k] = moving;
}

/*
 * Remove the root block from the heap and return it.  The last block is
 * moved to the root, the heap shrinks by one and the moved block is sunk
 * to its position.
 */
BLOCK* hremove(BLOCK** a){
  BLOCK *top = a[1];

  a[1] = a[size_of_heap];
  --size_of_heap;
  downheap(a, 1);
  return top;
}
