/*
	Copyright (C) 1995-2007	Edward Der-Hua Liu, Hsin-Chu, Taiwan
*/

#include "gcin.h"
#include "pho.h"
#include "tsin.h"


/* Per-bucket start index into phidx[]; read from the .idx file by
   load_tsin_db() and shifted by save_phrase_to_db() on insert. */
int hashidx[TSIN_HASH_N];
/* Heap array of byte offsets into the phrase file, one per phrase. */
int *phidx;
/* Phrase database file, opened "r+" by load_tsin_db(). */
FILE *fph;
/* Number of valid entries in phidx[]. */
int phcount;
/* Path of the index file: tsfname with ".idx" appended. */
char tsidxfname[64]="";

/* Number of int slots allocated for phidx[] (phcount plus head-room). */
static int a_phcount;
/* Path of the phrase database file; resolved on first load_tsin_db() call. */
static char tsfname[64]="";

#if USE_TSIN
void load_tsin_db()
{
  if (!tsfname[0]) {
    if (!getenv("GCIN_TABLE_DIR"))
      get_gcin_user_fname("tsin32", tsfname);
    else
      get_sys_table_file_name("tsin32", tsfname);
  }


  strcpy(tsidxfname, tsfname);
  strcat(tsidxfname, ".idx");

  FILE *fr;

  if ((fr=fopen(tsidxfname,"r"))==NULL) {
    p_err("Cannot open %s\n", tsidxfname);
  }

  fread(&phcount,4,1,fr);
#if     0
  printf("phcount:%d\n",phcount);
#endif
  a_phcount=phcount+256;
  fread(&hashidx,1,sizeof(hashidx),fr);

  if (phidx)
    free(phidx);

  if ((phidx=tmalloc(int, a_phcount))==NULL)
    p_err("malloc err pp 1");

  fread(phidx,4, phcount, fr);
  fclose(fr);

  if (fph)
    fclose(fph);

  dbg("tsfname: %s\n", tsfname);

  if ((fph=fopen(tsfname,"r+"))==NULL)
    p_err("Cannot open %s", tsfname);
}
#endif


#if USE_TSIN
/* Close the phrase database file and release the offset index, leaving
   both globals NULL so a later call is a no-op. */
void free_tsin()
{
  FILE *db = fph;

  fph = NULL;
  if (db != NULL)
    fclose(db);

  free(phidx);   /* free(NULL) is a no-op */
  phidx = NULL;
}
#endif


/*
 * Order two serialized phrase records by their phonetic keys.
 *
 * Each record is: 1 length byte (number of keys), a usecount_t (ignored
 * here), then that many phokey_t values. Keys are compared pairwise over
 * the shorter length; if all of those are equal, the record with fewer
 * keys sorts first.
 *
 * Returns 1, -1 or 0.
 */
static int phseq(u_char *a, u_char *b)
{
  const u_char cnt_a = a[0];
  const u_char cnt_b = b[0];
  const u_char *pa = a + 1 + sizeof(usecount_t);
  const u_char *pb = b + 1 + sizeof(usecount_t);
  const int common = cnt_a < cnt_b ? cnt_a : cnt_b;
  int k;

  for (k = 0; k < common; k++) {
    phokey_t ka, kb;

    /* memcpy because the keys inside the byte buffer may be unaligned */
    memcpy(&ka, pa, sizeof(phokey_t));
    memcpy(&kb, pb, sizeof(phokey_t));
    if (ka != kb)
      return ka > kb ? 1 : -1;

    pa += sizeof(phokey_t);
    pb += sizeof(phokey_t);
  }

  if (cnt_a == cnt_b)
    return 0;
  return cnt_a > cnt_b ? 1 : -1;
}

/* Forward declaration: save_phrase_to_db() calls this before its definition. */
void inc_dec_tsin_use_count(phokey_t *pho, char *ch, int N, gboolean b_dec);


/*
 * Append a phrase to the phrase database file and insert its offset into
 * the sorted index (phidx[]/hashidx[]), then rewrite the index file.
 *
 * phkeys   - `len` phonetic keys of the phrase
 * utf8str  - the phrase text in UTF-8 (`len` characters)
 * len      - number of characters / keys, 1..MAX_PHRASE_LEN
 * usecount - initial use count stored with the record
 *
 * Returns TRUE when the phrase was added and the index file rewritten.
 * Returns FALSE when the database is not loaded, `len` is out of range,
 * the first key hashes outside the table, a stored record is unreadable,
 * the phrase already exists (its use count is incremented instead), or a
 * file write fails.
 */
gboolean save_phrase_to_db(phokey_t *phkeys, char *utf8str, int len, usecount_t usecount)
{
  u_char tbuf[MAX_PHRASE_LEN*(sizeof(phokey_t)+CH_SZ) + 1 + sizeof(usecount_t)],
         sbuf[MAX_PHRASE_LEN*(sizeof(phokey_t)+CH_SZ) + 1 + sizeof(usecount_t)];
  const size_t hdr_sz = 1 + sizeof(usecount_t);   /* length byte + use count */
  int i;

  if (!fph || !phidx)
    return FALSE;

  /* tbuf is sized for MAX_PHRASE_LEN characters and the length is stored
     in a single byte; a zero-length record would read back as an error. */
  if (len <= 0 || len > MAX_PHRASE_LEN)
    return FALSE;

  const size_t key_sz = sizeof(phokey_t) * (size_t)len;
  int tlen = utf8_tlen(utf8str, len);
  if (tlen < 0 || hdr_sz + key_sz + (size_t)tlen > sizeof(tbuf))
    return FALSE;
  const size_t rec_sz = hdr_sz + key_sz + (size_t)tlen;

  /* Serialize the new record: length, use count, keys, UTF-8 text. */
  tbuf[0] = (u_char)len;
  memcpy(&tbuf[1], &usecount, sizeof(usecount));
  memcpy(&tbuf[hdr_sz], phkeys, key_sz);
  memcpy(&tbuf[hdr_sz + key_sz], utf8str, tlen);

  int hashno = phkeys[0] >> TSIN_HASH_SHIFT;
  if (hashno >= TSIN_HASH_N)
    return FALSE;

  /* The bucket ends where the next one starts; the last bucket has no
     successor in hashidx[], so it ends at phcount. */
  int mid = hashidx[hashno];
  int end = (hashno + 1 < TSIN_HASH_N) ? hashidx[hashno + 1] : phcount;
  if (mid < 0 || mid > phcount)
    return FALSE;
  if (end > phcount)
    end = phcount;

  /* Find the insertion point: the first entry that does not sort before
     the new record. Keep scanning entries with identical keys so that an
     identical phrase is detected even when it is not the first of them. */
  int ins = -1;
  for (; mid < end; mid++) {
    memset(sbuf, 0, sizeof(sbuf));
    fseek(fph, phidx[mid], SEEK_SET);

    if (fread(sbuf, 1, 1, fph) != 1 || sbuf[0] == 0 || sbuf[0] > MAX_PHRASE_LEN) {
      dbg("save_phrase_to_db: bad record at index %d\n", mid);
      return FALSE;
    }

    if (fread(&sbuf[1], sizeof(usecount_t), 1, fph) != 1) {
      dbg("save_phrase_to_db: bad record at index %d\n", mid);
      return FALSE;
    }

    /* Text is stored with its real UTF-8 length, which may be shorter
       than CH_SZ per character, so only the keys are required here. */
    size_t got = fread(&sbuf[hdr_sz], 1, (sizeof(phokey_t) + CH_SZ) * sbuf[0], fph);
    if (got < sizeof(phokey_t) * sbuf[0]) {
      dbg("save_phrase_to_db: bad record at index %d\n", mid);
      return FALSE;
    }

    int ord = phseq(sbuf, tbuf);
    if (ord < 0)
      continue;
    if (ins < 0)
      ins = mid;
    if (ord > 0)
      break;

    /* ord == 0: same key count and keys, so the text starts at the same
       offset in both buffers. */
    if (!memcmp(&sbuf[hdr_sz + key_sz], utf8str, tlen)) {
      dbg("Phrase already exists\n");
      inc_dec_tsin_use_count(phkeys, utf8str, len, FALSE);
      return FALSE;
    }
  }

  if (ins < 0)
    ins = mid;   /* nothing sorts at or after it: append to the bucket */

  /* Write the record first so the index is only touched on success. */
  if (fseek(fph, 0, SEEK_END) != 0)
    return FALSE;
  long ofs = ftell(fph);
  if (ofs < 0)
    return FALSE;
  if (fwrite(tbuf, 1, rec_sz, fph) != rec_sz) {
    dbg("%s write err", tsfname);
    return FALSE;
  }
  fflush(fph);

  /* Grow before shifting: the shift writes slot phcount, which must
     already exist. */
  if (phcount + 1 >= a_phcount) {
    a_phcount += 256;
    if (!(phidx = trealloc(phidx, int, a_phcount))) {
      p_err("tsin.c:realloc err");
    }
  }

  memmove(&phidx[ins + 1], &phidx[ins], sizeof(int) * (size_t)(phcount - ins));
  phidx[ins] = (int)ofs;
  phcount++;

  /* Every later bucket now starts one entry further on. */
  for (i = hashno + 1; i < TSIN_HASH_N; i++)
    hashidx[i]++;

  FILE *fw = fopen(tsidxfname, "w");
  if (fw == NULL) {
    dbg("%s create err", tsidxfname);
    return FALSE;
  }

  gboolean ok = TRUE;
  if (fwrite(&phcount, 4, 1, fw) != 1 ||
      fwrite(&hashidx, sizeof(hashidx), 1, fw) != 1 ||
      fwrite(phidx, 4, phcount, fw) != (size_t)phcount)
    ok = FALSE;
  if (fclose(fw) != 0)
    ok = FALSE;

  if (!ok)
    dbg("%s write err", tsidxfname);

  return ok;
}


/* Phrase file offsets ordered by phrase text; allocated by build_ts_gtab(). */
int *ts_gtab;
/* Number of entries in ts_gtab[]; only declared here, the definition is
   not in this part of the file. */
extern int ts_gtabN;

/*
 * Read one phrase record from fph at the current file position and store
 * its text in `str` as a NUL-terminated UTF-8 string.
 *
 * Record layout as read here: 1 length byte (character count), a
 * usecount_t, `len` phokey_t values, then `len` UTF-8 characters whose
 * byte sizes are taken from utf8_sz() on each lead byte.
 *
 * str - destination buffer; the visible callers pass MAX_PHRASE_STR_LEN
 *       bytes.
 *
 * Returns the byte length of the text, or 0 at end of file or when the
 * record is malformed or truncated.
 */
int read_tsin_phrase(char *str)
{
  u_char len = 0;
  usecount_t usecount;
  u_char pho[sizeof(phokey_t) * MAX_PHRASE_LEN];

  if (fread(&len, 1, 1, fph) != 1)
    return 0;
  if (len > MAX_PHRASE_LEN || len == 0)
    return 0;

  /* Use count and keys are not needed here, but must be skipped; a short
     read means the record is cut off. */
  if (fread(&usecount, sizeof(usecount_t), 1, fph) != 1)
    return 0;
  if (fread(pho, sizeof(phokey_t), len, fph) != (size_t)len)
    return 0;

  int i;
  int tlen = 0;

  for (i = 0; i < len; i++) {
    /* Lead byte first: it determines how many more bytes belong to the
       character. Never size a character from a byte that was not read. */
    if (fread(&str[tlen], 1, 1, fph) != 1) {
      str[tlen] = 0;
      return 0;
    }

    int sz = utf8_sz(&str[tlen]);
    if (sz < 1) {
      str[tlen] = 0;
      return 0;
    }

    if (sz > 1 && fread(&str[tlen+1], 1, sz-1, fph) != (size_t)(sz-1)) {
      str[tlen] = 0;
      return 0;
    }

    tlen += sz;
  }

  str[tlen] = 0;

  return tlen;
}

/* Scratch record used by build_ts_gtab(): a phrase's text together with
   the file position it was read from. */
typedef struct {
  char ts[MAX_PHRASE_STR_LEN];  /* NUL-terminated text filled by read_tsin_phrase() */
  int ofs;                      /* ftell() position of the record in fph */
} TS_TMP;

/* qsort() comparator ordering TS_TMP records by phrase text (strcmp order). */
static int qcmp_ts_gtab(const void *aa, const void *bb)
{
  const TS_TMP *lhs = aa;
  const TS_TMP *rhs = bb;

  return strcmp(lhs->ts, rhs->ts);
}


#if USE_TSIN
/*
 * Rebuild ts_gtab[]: reload the database, read every phrase record from
 * the start of the phrase file, sort the records by text, and keep only
 * their file offsets (in that sorted order). ts_gtabN receives the
 * number of records read.
 */
void build_ts_gtab()
{
  TS_TMP *recs = NULL;
  int nrec = 0;
  int k;

  load_tsin_db();

  fseek(fph, 0, SEEK_SET);

  /* Drop any previous table; free(NULL) is a no-op. */
  free(ts_gtab);
  ts_gtab = NULL;

  for (;;) {
    if (feof(fph))
      break;

    /* Always keep one spare slot for the record about to be read. */
    recs = trealloc(recs, TS_TMP, nrec + 1);
    if (!recs)
      p_err("tsin.c:realloc err");

    recs[nrec].ofs = ftell(fph);

    /* 0 means end of file or an unreadable record: stop scanning. */
    if (read_tsin_phrase(recs[nrec].ts) == 0)
      break;

    nrec++;
  }

  qsort(recs, nrec, sizeof(TS_TMP), qcmp_ts_gtab);

  ts_gtabN = nrec;
  ts_gtab = tmalloc(int, ts_gtabN);

  for (k = 0; k < nrec; k++)
    ts_gtab[k] = recs[k].ofs;

  free(recs);
}
#endif


/* Read the text of the idx-th phrase (in text-sorted order) into tstr.
   Returns what read_tsin_phrase() returns: the text length, 0 on error. */
static int load_ts_gtab(int idx, char *tstr)
{
  fseek(fph, ts_gtab[idx], SEEK_SET);

  return read_tsin_phrase(tstr);
}

#if USE_TSIN
/*
 * Search the text-sorted phrase table for phrases that begin with the
 * first `len` bytes of `str` and are longer than that prefix.
 *
 * str             - in: the prefix; out: overwritten (strcpy) with the
 *                   full text of the phrase the binary search landed on,
 *                   so the buffer must be able to hold a whole phrase.
 * len             - prefix length as handed to strncmp(). The original
 *                   note on this function reads "len is in CH_SZ";
 *                   TODO confirm the intended unit with the callers.
 * match_chars     - optional; receives the concatenated characters that
 *                   follow the prefix in each matching phrase,
 *                   NUL-terminated.
 * match_chars_max - maximum number of characters stored in match_chars.
 *
 * Returns 0 when nothing matches or the database could not be read,
 * 1 when match_chars is NULL and a match exists, otherwise the number
 * of characters stored in match_chars.
 */
int find_match(char *str, int len, char *match_chars, int match_chars_max)
{
  if (len <= 0)
    return 0;

  if (!ts_gtabN)
    build_ts_gtab();

  if (match_chars)
      match_chars[0] = 0;

  /* Still empty after building: there is no entry 0 to probe. */
  if (ts_gtabN <= 0)
    return 0;

  int bottom = 0;
  int top = ts_gtabN - 1;
  char tstr[MAX_PHRASE_STR_LEN];
  int matchN = 0;

  while (bottom <= top) {
    int mid = (bottom + top) / 2;
    int tlen = load_ts_gtab(mid, tstr);

    if (!tlen) {  // error in db
      dbg("error in db\n");
      build_ts_gtab();
      return 0;
    }

    int r = strncmp(str, tstr, len);

    if (r < 0) {
      top = mid - 1;
      continue;
    }

    /* An entry equal to the prefix itself is not a continuation; longer
       entries with the same prefix sort after it. */
    if (r > 0 || strlen(tstr) == (size_t)len) {
      bottom = mid + 1;
      continue;
    }

    strcpy(str, tstr);

    if (!match_chars)
      return 1;

    int i;
    int totlen = 0;

    /* Matching entries are contiguous: collect downwards from mid ... */
    for (i = mid; i >= 0; i--) {
      tlen = load_ts_gtab(i, tstr);

      if (strncmp(str, tstr, len) || tlen <= len)
        break;

      if (matchN >= match_chars_max)
        break;

      int slen = u8cpy(&match_chars[totlen], &tstr[len]);
      totlen += slen;
      matchN++;
    }

    /* ... then upwards from mid+1. */
    for (i = mid + 1; i < ts_gtabN; i++) {
      tlen = load_ts_gtab(i, tstr);

      if (strncmp(str, tstr, len) || tlen <= len)
        break;

      if (matchN >= match_chars_max)
        break;

      int slen = u8cpy(&match_chars[totlen], &tstr[len]);
      totlen += slen;
      matchN++;
    }

    match_chars[totlen] = 0;

    return matchN;
  }

  return 0;
}
#endif


/*
 * Load the idx-th phrase (in key-sorted index order) from the phrase file.
 *
 * idx      - position in phidx[]
 * len      - out: number of characters/keys, or 0 on any failure
 * usecount - out: stored use count (0 on failure)
 * pho      - out: `*len` phonetic keys; needs room for MAX_PHRASE_LEN
 * ch       - out, optional: CH_SZ * `*len` bytes are requested from the
 *            file; stored text may be shorter, so the tail can be short
 *
 * An index at or past phcount, or a record that cannot be read, is taken
 * as a sign that the database changed on disk: the database is reloaded
 * and `*len` is set to 0.
 */
void load_tsin_entry(int idx, char *len, usecount_t *usecount, phokey_t *pho,
                    u_char *ch)
{
  *usecount = 0;
  /* Callers pass uninitialised storage; never leave it that way. */
  *len = 0;

  if (idx < 0)
    return;

  if (idx >= phcount) {
    load_tsin_db(); // probably db changed, reload;
    return;
  }

  int ph_ofs=phidx[idx];

  fseek(fph, ph_ofs, SEEK_SET);

  if (fread(len, 1, 1, fph) != 1 || *len > MAX_PHRASE_LEN || *len <= 0) {
    dbg("err: tsin db changed reload");
    load_tsin_db(); // probably db changed, reload;
    *len = 0;
    return;
  }

  size_t n = (size_t)(*len);

  if (fread(usecount, sizeof(usecount_t), 1, fph) != 1 ||
      fread(pho, sizeof(phokey_t), n, fph) != n) {
    dbg("err: tsin db changed reload");
    load_tsin_db(); // probably db changed, reload;
    *usecount = 0;
    *len = 0;
    return;
  }

  /* Not checked: this asks for CH_SZ bytes per character, but records
     are written with their real UTF-8 length, so the last record in the
     file can legitimately come up short. */
  if (ch)
    fread(ch, CH_SZ, n, fph);
}


/* Compare the first `len` keys of a and b element by element.
   Returns 1 or -1 at the first difference, 0 if all `len` keys match. */
int phokey_t_seq(phokey_t *a, phokey_t *b, int len)
{
  int k = 0;

  while (k < len) {
    if (a[k] != b[k])
      return a[k] > b[k] ? 1 : -1;
    k++;
  }

  return 0;
}

/*
 * Find the index range of phrases whose keys start with pho[0..plen-1].
 *
 * pho   - the key prefix to look for
 * plen  - number of keys in pho
 * r_sti - out: first matching index (inclusive)
 * r_edi - out: index just past the last match; r_sti <= range < r_edi
 *
 * Returns TRUE and fills r_sti/r_edi when at least one phrase matches;
 * returns FALSE (outputs untouched) otherwise.
 */
gboolean tsin_seek(phokey_t *pho, int plen, int *r_sti, int *r_edi)
{
  /* Initialised so a skipped search loop reports "no match". */
  int mid = 0, cmp = -1;
  phokey_t ss[MAX_PHRASE_LEN], stk[MAX_PHRASE_LEN];
  u_char mlen, stch[MAX_PHRASE_LEN * CH_SZ];
  char len;
  usecount_t usecount;
  int hashi= *pho >> TSIN_HASH_SHIFT;

  if (hashi >= TSIN_HASH_N)
    return FALSE;

  int top=hashidx[hashi];
  /* The last bucket has no successor entry in hashidx[]; it ends at phcount. */
  int bot=(hashi + 1 < TSIN_HASH_N) ? hashidx[hashi+1] : phcount;

  if (top < 0 || top>=phcount)
    return FALSE;

  /* Never probe index phcount: load_tsin_entry() treats that as a changed
     database and reloads everything. */
  if (bot >= phcount)
    bot = phcount - 1;

  while (top <= bot) {
    mid=(top+bot)/ 2;
    load_tsin_entry(mid, &len, &usecount, ss, stch);

    if (len > plen)
      mlen=plen;
    else
      mlen=len;

    cmp=phokey_t_seq(ss, pho, mlen);

    /* An entry that is a proper prefix of pho sorts before it. */
    if (!cmp && len < plen)
      cmp=-2;

    if (cmp>0)
      bot=mid-1;
    else
    if (cmp<0)
      top=mid+1;
    else
      break;
  }

  if (cmp) {
//    dbg("no match %d\n", cmp);
    return FALSE;
  }

  // seek to the first match because binary search is used
  int sti;
  for(sti = mid; sti>=0; sti--) {
    load_tsin_entry(sti, &len, &usecount, stk, stch);

    if (len >= plen && !phokey_t_seq(stk, pho, plen))
      continue;
    break;
  }
  sti++;

  // seek to the tail
  int edi;
  for(edi = mid; edi < phcount; edi++) {
    load_tsin_entry(edi, &len, &usecount, stk, stch);

    if (len >= plen && !phokey_t_seq(stk, pho, plen))
      continue;
    break;
  }

  *r_sti = sti;
  *r_edi = edi;

  return TRUE;
}

/*
 * Increment or decrement the stored use count of one phrase.
 *
 * pho   - the phrase's N phonetic keys
 * ch    - the phrase text in UTF-8 (N characters)
 * N     - number of characters/keys
 * b_dec - non-zero to decrement, zero to increment
 *
 * Entries with exactly these N keys are scanned; each one whose text
 * equals `ch` has its count rewritten in place in the phrase file. The
 * count is not decremented at or below -127 nor incremented at or above
 * 0x3fffffff.
 */
void inc_dec_tsin_use_count(phokey_t *pho, char *ch, int N, gboolean b_dec)
{
  int first, last;

  if (!tsin_seek(pho, N, &first, &last))
    return;

  /* Result is not used; the call is kept as in the original code. */
  (void)utf8_tlen(ch, N);

  int k;
  for (k = first; k < last; k++) {
    char elen;
    usecount_t cur, next;
    phokey_t keys[MAX_PHRASE_LEN];
    char text[MAX_PHRASE_LEN * CH_SZ];

    load_tsin_entry(k, &elen, &cur, keys, text);
    next = cur;

    /* The first entry with another length or other keys ends the scan. */
    if (elen != N || phokey_t_seq(keys, pho, N) != 0)
      break;

    /* Same keys, different text (a homophone): look at the next entry. */
    if (!utf8_str_eq(text, ch, N))
      continue;

    /* The use count sits right after the one-byte length. */
    fseek(fph, phidx[k] + 1, SEEK_SET);

    if (b_dec) {
      if (cur > -127)
        next--;
    } else if (cur < 0x3fffffff) {
      next++;
    }

    if (next == cur)
      continue;

    fwrite(&next, sizeof(usecount_t), 1, fph); // use count
    fflush(fph);
  }
}
