/*======================================================================*\
|*		Editor mined						*|
|*		CJK character set <-> Unicode mapping tables		*|
\*======================================================================*/

#include "mined.h"
#include "charmaps.h"


/*======================================================================*\
|*			Character properties				*|
\*======================================================================*/

/**
   Tell whether a character counts as a control character under the
   currently active text encoding.
   - mapped text: the character is first converted with lookup_cjk ();
     DEL, or a non-zero result below space, is a control character
   - PC terminal, UTF-8 or CJK text: DEL or any value below space
   - otherwise: DEL, or any value whose low 7 bits are below space
   @return non-zero if c is a control character, 0 otherwise
 */
int
iscontrol (c)
  character c;
{
  unsigned long code;

  if (mapped_text) {
	code = lookup_cjk (c);
	if (code == '\177') {
		return 1;
	}
	return code > 0 && code < ' ';
  }

  if (c == '\177') {
	return 1;
  }

  if (pc_term || utf8_text || cjk_text) {
	return c < ' ';
  }

  /* ignore the top bit so that the upper control range matches too */
  return (c & '\177') < ' ';
}

/**
   Return the printable stand-in used to display a control character:
   '?' for DEL ('\177'), otherwise the character shifted up by '@'
   (so that e.g. value 1 is shown as 'A').
 */
character
controlchar (c)
  character c;
{
  return c == '\177' ? '?' : c + '@';
}


/**
   Return the isolated presentation form of an ALEF character.
   Any value that is not one of the four handled ALEF variants is
   mapped to ALEF SYMBOL (0x2135).
 */
unsigned long
isolated_alef (unichar)
  unsigned long unichar;
{
	switch (unichar) {
	case 0x0622:	/* ALEF WITH MADDA ABOVE */
		return 0xFE81;
	case 0x0623:	/* ALEF WITH HAMZA ABOVE */
		return 0xFE83;
	case 0x0625:	/* ALEF WITH HAMZA BELOW */
		return 0xFE87;
	case 0x0627:	/* ALEF */
		return 0xFE8D;
	default:	/* ? -> ALEF SYMBOL */
		return 0x2135;
	}
}


/**
   Return the largest character value of the current encoding:
   depends on cjk_encoding for CJK text, is 0x7FFFFFFF for UTF-8 text
   and 0xFF for plain 8 bit text.
 */
unsigned long
max_char_value ()
{
  if (! cjk_text) {
	return utf8_text ? 0x7FFFFFFF : 0xFF;
  }

  if (cjk_encoding == 'G') {
	return 0xFFFFFFFF;
  } else if (cjk_encoding == 'C') {
	return 0x8EFFFFFF;
  } else if (cjk_encoding == 'J') {
	return 0x8FFFFF;
  }
  return 0xFFFF;
}


/**
   Convert a CJK character value to its byte sequence.
   The bytes are stored in buf, most significant byte first, followed
   by a NUL byte.
   @return number of bytes stored (1..4), or 0 if the value is not
	accepted for the current encoding or one of its bytes is NUL
	(buf then holds an empty string)
 */
int
cjkencode (cjkchar, buf)
  unsigned long cjkchar;
  character * buf;
{
  unsigned long rest = cjkchar;
  int nbytes = 0;
  int second;
  int pos;

  /* determine the sequence length from the value range */
  if (cjkchar < 0x100) {
	if (! multichar (cjkchar)) {
		nbytes = 1;
	}
  } else if (cjkchar < 0x10000) {
	if (cjkchar >= 0x8000 && (cjkchar & 0xFF) > 0 &&
	    multichar ((character) (cjkchar >> 8))) {
		nbytes = 2;
	}
  } else if (cjkchar < 0x1000000) {
	/* 3 byte codes are only accepted with lead byte 0x8F in 'J' */
	if (cjk_encoding == 'J' && (cjkchar >> 16) == 0x8F) {
		nbytes = 3;
	}
  } else {
	second = (cjkchar >> 16) & 0xFF;
	if ((cjk_encoding == 'G' && cjkchar >= 0x80000000
	     && second >= '0' && second <= '9')
	 || (cjk_encoding == 'C' && (cjkchar >> 24) == 0x8E)) {
		nbytes = 4;
	}
  }

  /* store the bytes back to front; an embedded NUL voids the result */
  for (pos = nbytes; pos > 0; pos --) {
	buf [pos - 1] = rest & 0xFF;
	rest = rest >> 8;
	if (buf [pos - 1] == '\0') {
		nbytes = 0;
	}
  }
  buf [nbytes] = '\0';

  return nbytes;
}

/**
   Convert a Unicode character to UTF-8.
   The sequence (1 to 6 bytes) is stored in buf and NUL-terminated,
   so buf must provide room for at least 7 bytes.
   @return length of the sequence, or 0 (empty string stored) if
	unichar is 0x80000000 or larger
 */
int
uniUTF (unichar, buf)
  unsigned long unichar;
  character * buf;
{
  /* lead byte marker for each sequence length (index 0 unused) */
  static const unsigned char lead [] =
	{0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
  int len;
  int pos;

  if (unichar < 0x80) {
	len = 1;
  } else if (unichar < 0x800) {
	len = 2;
  } else if (unichar < 0x10000) {
	len = 3;
  } else if (unichar < 0x200000) {
	len = 4;
  } else if (unichar < 0x4000000) {
	len = 5;
  } else if (unichar < 0x80000000) {
	len = 6;
  } else {
	len = 0;
  }

  if (len > 0) {
	/* continuation bytes carry 6 bits each, filled in from the end */
	for (pos = len - 1; pos > 0; pos --) {
		buf [pos] = 0x80 | (unichar & 0x3F);
		unichar = unichar >> 6;
	}
	/* the remaining high bits go into the lead byte */
	buf [0] = lead [len] | unichar;
  }
  buf [len] = '\0';

  return len;
}

/**
   Convert a character to its byte sequence in the current encoding
   (UTF-8, CJK, or a single byte).
   @return pointer to a static NUL-terminated buffer; its contents
	are overwritten by the next call
 */
char *
encode_char (c)
  unsigned long c;
{
  static char buf [7];

  if (! utf8_text && ! cjk_text) {
	/* plain 8 bit text: the character is its own encoding */
	buf [0] = (character) c;
	buf [1] = '\0';
	return buf;
  }

  if (utf8_text) {
	(void) uniUTF (c, buf);
  } else {
	(void) cjkencode (c, buf);
  }
  return buf;
}


/**
   Check if a CJK character is encoded in the defined code range of 
   the active encoding (as selected by cjk_encoding).
   @param cjkchar	the CJK character value
   @param cjkbytes	the byte sequence of the character, or 0;
			if 0, the sequence is derived from cjkchar
			with cjkencode ().
			NOTE(review): a caller-supplied sequence is
			presumably expected to be NUL-terminated, as
			most branches test for a terminating 0 byte
			- confirm against callers.
   @return True if the bytes fit the ranges listed below (values
	below 0x80 are always accepted), False otherwise or if the
	encoding tag is not handled here
 */
FLAG
valid_cjk (cjkchar, cjkbytes)
  unsigned long cjkchar;
  character * cjkbytes;
{
  /* local encoding buffer, used only if no byte sequence was passed */
  character cjkbuf [5];

  /* values below 0x80 are accepted regardless of the encoding */
  if (cjkchar < 0x80) {
	return True;
  }

  if (cjkbytes == 0) {
	cjkbytes = cjkbuf;
	(void) cjkencode (cjkchar, cjkbytes);
  }

/*
   The tables in the comments below list the byte ranges per encoding,
   one column per byte position.

         GB 18030 >> GBK >> GB 2312-80
                        GB18030	*?	81-FE	40-7E, 80-FE
                        		81-FE	30-39	81-FE	30-39
         Big5-HKSCS >> Big5
                        Big5	!	A1-FE	40-7E, A1-FE
                       (Big5+	!	89-FE	40-7E, A1-FE)
                        Big5+	!	88-FE	40-7E, A1-FE
         CNS
         EUC-TW -> CNS 11643-1992
                        EUC-TW	-	A1-FE	A1-FE
                        		8E	A1-A7	A1-FE	A1-FE
*/
  switch (cjk_encoding) {
    /* values above 0xFFFF are checked as 4 byte codes, others as 2 byte codes */
    case 'G':	if (cjkchar > 0xFFFF) {
			return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
				&& cjkbytes [1] >= '0' && cjkbytes [1] <= '9'
				&& cjkbytes [2] >= 0x81 && cjkbytes [2] <= 0xFE
				&& cjkbytes [3] >= '0' && cjkbytes [3] <= '9';
		} else {
			return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
				&& cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFE
				&& cjkbytes [1] != 0x7F;
		}
    case 'B':	return cjkbytes [0] >= 0x88 && cjkbytes [0] <= 0xFE
				&& ((cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0x7E)
				    ||
				    (cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE)
				   )
				&& cjkbytes [2] == 0;
    /* either a 2 byte code, or a 4 byte code introduced by 0x8E;
       note that the second byte of the latter is accepted up to 0xAF */
    case 'C':	return (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
			&& cjkbytes [2] == 0)
			||
			(cjkbytes [0] == 0x8E
			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xAF
			&& cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
			&& cjkbytes [3] >= 0xA1 && cjkbytes [3] <= 0xFE);
/*
         EUC-JP -> JIS X 0208 + JIS X 0212
                        EUC-JP	!*	A1-FE	A1-FE
                        		8F	A1-FE	A1-FE
                        		8E	A1-DF
         Shift-JIS 0213 >> Shift-JIS
                        Shift-JIS	A1-DF
                        		81-9F, E0-EF	40-7E, 80-FC
*/
    /* 2 byte code, 2 byte code introduced by 0x8E,
       or 3 byte code introduced by 0x8F */
    case 'J':	return (cjkbytes [0] >= 0xA1 && cjkbytes [0] <= 0xFE
			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
			&& cjkbytes [2] == 0
			)
			||
			(cjkbytes [0] == 0x8E
			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xDF
			&& cjkbytes [2] == 0)
			||
			(cjkbytes [0] == 0x8F
			&& cjkbytes [1] >= 0xA1 && cjkbytes [1] <= 0xFE
			&& cjkbytes [2] >= 0xA1 && cjkbytes [2] <= 0xFE
			&& cjkbytes [3] == 0);
    /* single byte value A1-DF, or 2 byte code;
       note that any first byte from 0xE0 upwards is accepted */
    case 'S':	return (cjkchar >= 0xA1 && cjkchar <= 0xDF)
			|| (((cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0x9F)
				||
			     (cjkbytes [0] >= 0xE0)
			    )
			    && cjkbytes [1] >= 0x40 && cjkbytes [1] <= 0xFC
			    && cjkbytes [1] != 0x7F
			    && cjkbytes [2] == 0);
/*
         UHC >> KSC
                        EUC-KR	!	A1-FE	A1-FE
                        UHC	*	81-FE	41-5A, 61-7A, 81-FE
         Johab
                        Johab		84-DE, E0-F9	31-7E, 81-FE
*/
    case 'K':	return cjkbytes [0] >= 0x81 && cjkbytes [0] <= 0xFE
			&& ((cjkbytes [1] >= 0x41 && cjkbytes [1] <= 0x5A)
			    ||
			    (cjkbytes [1] >= 0x61 && cjkbytes [1] <= 0x7A)
			    ||
			    (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
			   )
			&& cjkbytes [2] == 0;
    case 'H':	return ((cjkbytes [0] >= 0x84 && cjkbytes [0] <= 0xDE)
			 ||
			(cjkbytes [0] >= 0xE0 && cjkbytes [0] <= 0xF9)
			)
			&&
			((cjkbytes [1] >= 0x31 && cjkbytes [1] <= 0x7E)
			 ||
			 (cjkbytes [1] >= 0x81 && cjkbytes [1] <= 0xFE)
			)
			&& cjkbytes [2] == 0;
    /* encoding tag without a range definition here */
    default:	return False;
  }
}


/*======================================================================*\
	Conversion tables mapping various CJK encodings to Unicode
\*======================================================================*/

/*
   Mapping tables defined outside this file, each paired with a
   variable holding its number of entries.
   set_char_encoding () selects one pair as the active table.
 */
extern struct cjk_table_entry big5_table [];
extern unsigned int big5_table_len;
extern struct cjk_table_entry gb_table [];
extern unsigned int gb_table_len;
extern struct cjk_table_entry cns_table [];
extern unsigned int cns_table_len;
extern struct cjk_table_entry ejis_table [];
extern unsigned int ejis_table_len;
extern struct cjk_table_entry sjis_table [];
extern unsigned int sjis_table_len;
extern struct cjk_table_entry uhc_table [];
extern unsigned int uhc_table_len;
extern struct cjk_table_entry johb_table [];
extern unsigned int johb_table_len;
extern struct cjk_table_entry viscii_table [];
extern unsigned int viscii_table_len;
extern struct cjk_table_entry tcvn_table [];
extern unsigned int tcvn_table_len;
extern struct cjk_table_entry tis620_table [];
extern unsigned int tis620_table_len;
extern struct cjk_table_entry koi8_ru_table [];
extern unsigned int koi8_ru_table_len;
extern struct cjk_table_entry m_roman_table [];
extern unsigned int m_roman_table_len;
extern struct cjk_table_entry windows_table [];
extern unsigned int windows_table_len;


/*======================================================================*\
|*			Mapping tables and functions			*|
\*======================================================================*/

#ifdef use_cjk_tables

/*
   Values selected by map_char () for 'J' and 'S' table entries whose
   high Unicode part has its marker bit set: the remaining 7 bits of
   that part index this table, and the selected value is placed,
   shifted left by 16, into the result together with flag 0x80000000.
   NOTE(review): these look like the second code point of mappings to
   two-code-point sequences - confirm against the table generator.
 */
static unsigned int jis2 [] = 
	{0x309A, 0x0300, 0x0301, 0x02E5, 0x02E9};

/**
   Tag character of the current table-based encoding ('-' initially)
   and its two-letter indication string ("??" initially).
   Both are set together by set_cjk_encoding ().
 */
char cjk_encoding = '-';
char * cjk_encoding_flag = "??";

/**
   Current CJK/Unicode mapping table and its length (number of entries).
   No table is selected initially; set_char_encoding () assigns one.
 */
static struct cjk_table_entry * cjk_table = (struct cjk_table_entry *) 0;
static unsigned int cjk_table_len = 0;

/**
   Record the tag character and the indication string of the
   table-based encoding being selected.
 */
static
void
set_cjk_encoding (tag, flag)
  char tag;
  char * flag;
{
  cjk_encoding_flag = flag;
  cjk_encoding = tag;
}

/**
   Return the tag of the active encoding: 'U' for UTF-8 text,
   the current table tag for CJK or mapped text, 'L' otherwise.
 */
char
get_char_encoding ()
{
  if (utf8_text) {
	return 'U';
  }
  return (cjk_text || mapped_text) ? cjk_encoding : 'L';
}

/* Result of the last check_combining () scan: True if the mapped
   encoding selected then contains a combining character. */
static FLAG combined_text;

/**
   Return True if the active encoding has combining characters:
   always for UTF-8 text, for CJK text with encoding 'G', 'J' or 'S',
   and for mapped text if check_combining () found one.
 */
FLAG
encoding_has_combining ()
{
  int cjk_has_combining;

  switch (cjk_encoding) {
	case 'G':
	case 'J':
	case 'S':
		cjk_has_combining = 1;
		break;
	default:
		cjk_has_combining = 0;
		break;
  }

  return utf8_text
	|| (cjk_text && cjk_has_combining)
	|| (mapped_text && combined_text);
}

/**
   Determine if the active mapped encoding has combining characters
   by looking up all values 0..0xFF, and record the result in
   combined_text. The scan stops at the first combining character.
 */
static
void
check_combining ()
{
  unsigned long code = 0;

  combined_text = False;
  while (code < 0x100 && ! iscombining (lookup_cjk (code))) {
	code ++;
  }
  if (code < 0x100) {
	/* the scan stopped early, so a combining character was found */
	combined_text = True;
  }
}

/**
   Set character mapping table and text encoding variables according 
   to encoding tag.
   Return True on success, False if tag unknown.
 */
FLAG
set_char_encoding (tag)
  char tag;
{
  utf8_text = False;
  cjk_text = False;
  mapped_text = False;
  switch (tag) {
	case 'U':
		utf8_text = True;
		return True;
	case 'L':
		return True;
	case 'B':
		set_cjk_encoding (tag, "B5");
		cjk_table = big5_table;
		cjk_table_len = big5_table_len;
		cjk_text = True;
		return True;
	case 'G':
		set_cjk_encoding (tag, "GB");
		cjk_table = gb_table;
		cjk_table_len = gb_table_len;
		cjk_text = True;
		return True;
	case 'C':
		set_cjk_encoding (tag, "CN");
		cjk_table = cns_table;
		cjk_table_len = cns_table_len;
		cjk_text = True;
		return True;
	case 'J':
		set_cjk_encoding (tag, "JP");
		cjk_table = ejis_table;
		cjk_table_len = ejis_table_len;
		cjk_text = True;
		return True;
	case 'S':
		set_cjk_encoding (tag, "sJ");
		cjk_table = sjis_table;
		cjk_table_len = sjis_table_len;
		cjk_text = True;
		return True;
	case 'K':
		set_cjk_encoding (tag, "KR");
		cjk_table = uhc_table;
		cjk_table_len = uhc_table_len;
		cjk_text = True;
		return True;
	case 'H':
		set_cjk_encoding (tag, "Jh");
		cjk_table = johb_table;
		cjk_table_len = johb_table_len;
		cjk_text = True;
		return True;
	case 'V':
		set_cjk_encoding (tag, "VI");
		cjk_table = viscii_table;
		cjk_table_len = viscii_table_len;
		mapped_text = True;
		check_combining ();
		return True;
	case 'N':
		set_cjk_encoding (tag, "TC");
		cjk_table = tcvn_table;
		cjk_table_len = tcvn_table_len;
		mapped_text = True;
		check_combining ();
		return True;
	case 'T':
		set_cjk_encoding (tag, "TI");
		cjk_table = tis620_table;
		cjk_table_len = tis620_table_len;
		mapped_text = True;
		check_combining ();
		return True;
	case 'Y':
		set_cjk_encoding (tag, "CY");
		cjk_table = koi8_ru_table;
		cjk_table_len = koi8_ru_table_len;
		mapped_text = True;
		check_combining ();
		return True;
	case 'M':
		set_cjk_encoding (tag, "MR");
		cjk_table = m_roman_table;
		cjk_table_len = m_roman_table_len;
		mapped_text = True;
		check_combining ();
		return True;
	case 'W':
		set_cjk_encoding (tag, "WA");
		cjk_table = windows_table;
		cjk_table_len = windows_table_len;
		mapped_text = True;
		check_combining ();
		return True;
  }
  return False;
}

#ifdef split_map_entries
/*
   Decode CJK character value from split table entry.
   For encoding 'G', an extension byte 0xFF means that cjk_base is the
   complete value; otherwise a 4 byte value is assembled from the low
   byte of cjk_base (placed in bits 24..31), its high byte (kept in
   bits 8..15) and the two nibbles of cjk_ext, each added to 0x30 to
   form the 2nd and 4th byte.
   For the other encodings, cjk_ext is placed above cjk_base; extension
   values from 0x90 are additionally prefixed with 0x8E.
   All shifts are done on unsigned int values so that none of them can
   shift into the sign bit of a promoted int.
 */
static
unsigned int
decode_cjk (entrypoi)
  struct cjk_table_entry * entrypoi;
{
  unsigned int base = entrypoi->cjk_base;
  unsigned int ext = (unsigned int) entrypoi->cjk_ext;

  if (cjk_encoding == 'G') {
	if (ext == 0xFF) {
		return base;
	} else {
		return ((base & 0x00FFu) << 24)
			| (base & 0xFF00u)
			| 0x00300030u
			| ((ext & 0xF0u) << 12)
			| (ext & 0x0Fu);
	}
  } else {
	if (ext >= 0x90) {
		return 0x8E000000u | (ext << 16) | base;
	} else {
		return (ext << 16) | base;
	}
  }
}
#endif

/*
   Look up a Unicode value in a character set mapping table.
   @return CJK value, or CHAR_INVALID if not found
 */
static
unsigned long
unmap_char (unichar)
  unsigned long unichar;
{
#ifdef split_map_entries
	unsigned char unichar_high = unichar >> 16;
	unsigned short unichar_low = unichar & 0xFFFF;
#endif
	unsigned int i = 0;

	struct cjk_table_entry * cjk_table_poi = cjk_table;
	while (i ++ < cjk_table_len) {
#ifdef split_map_entries
		if (
		    unichar_low == cjk_table_poi->unicode_low
		 && unichar_high == cjk_table_poi->unicode_high
		   ) {
			return decode_cjk (cjk_table_poi);
		}
#else
		if (
		    unichar == cjk_table_poi->unicode
		   ) {
			return cjk_table_poi->cjk;
		}
#endif
		cjk_table_poi ++;
	}
	return CHAR_INVALID;
}

/*
   Map a character in a character set mapping table.
   @return Unicode value, or CHAR_INVALID if not found
 */
static
unsigned int
map_char (cjk)
  unsigned int cjk;
{
	int low = 0;
	int high = cjk_table_len - 1;
	int i;

	unsigned int cjki;

	while (low <= high) {
		i = (low + high) / 2;
#ifdef split_map_entries
		cjki = decode_cjk (& cjk_table [i]);
#else
		cjki = cjk_table [i].cjk;
#endif
		if (cjki == cjk) {
#ifdef split_map_entries
			if ((cjk_encoding == 'J' || cjk_encoding == 'S')
			 && (cjk_table [i].unicode_high & 0x80)
			   ) {
				return 0x80000000 | (jis2 [cjk_table [i].unicode_high & 0x7F] << 16) | (cjk_table [i].unicode_low);
			} else {
				return (((unsigned int) cjk_table [i].unicode_high) << 16) | (cjk_table [i].unicode_low);
			}
#else
			if ((cjk_encoding == 'J' || cjk_encoding == 'S')
			 && (cjk_table [i].unicode & 0x800000)
			   ) {
				return 0x80000000 | (jis2 [(cjk_table [i].unicode >> 16) & 0x7F] << 16) | (cjk_table [i].unicode & 0xFFFF);
			} else {
				return cjk_table [i].unicode;
			}
#endif
		} else if (cjki >= cjk) {
			high = i - 1;
		} else {
			low = i + 1;
		}
	}
	return 0;
}

#else

/* Build without mapping tables: the encoding tag and its indication
   string keep these placeholder values, as nothing in this branch
   assigns them. */
char cjk_encoding = '-';
char * cjk_encoding_flag = "--";

/**
   Return the tag of the active encoding; without mapping tables
   this is either 'U' (UTF-8 text) or 'L'.
 */
char
get_char_encoding ()
{
  return utf8_text ? 'U' : 'L';
}

/**
   Return True if active encoding has combining characters.
   This variant for builds without mapping tables always returns False,
   regardless of utf8_text.
 */
FLAG
encoding_has_combining ()
{
	return False;
}

/**
   Variant for builds without mapping tables: no encoding can be
   selected, so the tag is ignored, no encoding variable is changed
   and False is always returned.
   NOTE(review): this also rejects 'U' and 'L', which
   get_char_encoding () reports in this build - confirm that callers
   do not rely on setting those two tags here.
 */
FLAG
set_char_encoding (tag)
  char tag;
{
	return False;
}

/*
   Variant for builds without mapping tables:
   no Unicode value can be mapped, so the argument is ignored.
   @return always CHAR_INVALID
 */
static
unsigned long
unmap_char (unichar)
  unsigned long unichar;
{
	return CHAR_INVALID;
}

/*
   Variant for builds without mapping tables:
   no character can be mapped, so the argument is ignored.
   The "not found" value is 0, as this is what the callers cjk () and
   lookup_cjk () test for; with any other value they would no longer
   pass the ASCII range through transparently.
   @return always 0
 */
static
unsigned int
map_char (cjk)
  unsigned int cjk;
{
	(void) cjk;
	return 0;
}

#endif


/*======================================================================*\
|*		Conversion functions					*|
\*======================================================================*/

/**
   GB18030 algorithmic mapping part:
   convert a 4 byte GB18030 code with lead byte from 0x90 to the
   Unicode value it denotes (starting at 0x10000 for 0x90308130).
   The 2nd and 4th byte must be in '0'..'9' and the 3rd byte in
   0x81..0xFE; a lead byte below 0x90 is rejected as well, as it
   would make the calculation wrap around.
   @return Unicode value, or 0 if the code is not well-formed
 */
static
unsigned long
gb_to_unicode (gb)
  unsigned long gb;
{
	unsigned int byte1 = (gb >> 24) & 0xFF;
	unsigned int byte2 = (gb >> 16) & 0xFF;
	unsigned int byte3 = (gb >> 8) & 0xFF;
	unsigned int byte4 = gb & 0xFF;

	if (byte1 < 0x90
	 || byte2 < '0' || byte2 > '9'
	 || byte3 < 0x81 || byte3 > 0xFE
	 || byte4 < '0' || byte4 > '9') {
		return 0;
	}

	/* mixed radix number: digits weigh 10 * 126 * 10, 126 * 10, 10, 1 */
	return ((((unsigned long) (byte1 - 0x90)) * 10
		+ (byte2 - 0x30)) * 126L
		+ (byte3 - 0x81)) * 10L
		+ (byte4 - 0x30)
		+ 0x10000;
}

/**
   GB18030 algorithmic mapping part, reverse direction:
   convert a Unicode value from 0x10000 to a 4 byte GB18030 code.
   The offset from 0x10000 is split into mixed radix digits
   (10, 126, 10 from the last byte) which are added to the byte
   bases 0x30, 0x81, 0x30; the remainder is added to 0x90.
 */
static
unsigned long
unicode_to_gb (uc)
  unsigned long uc;
{
	unsigned long rest = uc - 0x10000;
	unsigned int byte1, byte2, byte3, byte4;

	byte4 = 0x30 + rest % 10;
	rest = rest / 10;
	byte3 = 0x81 + rest % 126;
	rest = rest / 126;
	byte2 = 0x30 + rest % 10;
	rest = rest / 10;
	byte1 = 0x90 + rest;

	return (byte1 << 24) | (byte2 << 16) | (byte3 << 8) | byte4;
}

/*
   cjk () converts a Unicode value into a CJK encoded character.
   May also be used for single-byte mapped character sets.
   For encoding 'G', values from 0x10000 are converted
   algorithmically. Otherwise the mapping table is consulted first;
   if it has no entry, values below 0x20 are returned unchanged, and
   values below 0x80 are returned unchanged unless the table maps
   that code to a different Unicode value.
   @return CJK value, or CHAR_INVALID if not found
 */
unsigned long
cjk (unichar)
  unsigned long unichar;
{
	unsigned long cjkchar;

	if (cjk_encoding == 'G' && unichar >= 0x10000) {
		return unicode_to_gb (unichar);
	}

	cjkchar = unmap_char (unichar);
	if (cjkchar != CHAR_INVALID) {
		return cjkchar;
	}

	if (unichar < 0x20) {
		/* transparently return control range (for commands) */
		return unichar;
	} else if (unichar < 0x80) {
		/* transparently map ASCII range unless remapped */
		cjkchar = unichar;
		/* map_char () is defined without prototype, so the
		   argument must be passed with its exact parameter type */
		unichar = map_char ((unsigned int) cjkchar);
		if (unichar > 0 && unichar != cjkchar) {
			return CHAR_INVALID;
		} else {
			return cjkchar;
		}
	} else {
		/* notify "not found" */
		return CHAR_INVALID;
	}
}

/*
   lookup_cjk () converts a CJK encoded character into a Unicode value.
   May also be used for single-byte mapped character sets.
   For encoding 'G', codes from 0x90000000 are converted
   algorithmically. Otherwise the mapping table is consulted; if it
   has no entry, codes below 0x80 are returned unchanged.
   @return Unicode value, or 0 if not found
 */
unsigned long
lookup_cjk (cjk)
  unsigned long cjk;
{
	unsigned long unichar;

	if (cjk_encoding == 'G' && cjk >= 0x90000000) {
		return gb_to_unicode (cjk);
	}

	/* map_char () is defined without prototype, so the argument
	   must be passed with its exact parameter type */
	unichar = map_char ((unsigned int) cjk);
	if (unichar > 0) {
		return unichar;
	} else if (cjk < 0x80) {
		/* transparently map ASCII range */
		return cjk;
	} else {
		/* notify "not found" */
		return 0;
	}
}


/*======================================================================*\
|*				End					*|
\*======================================================================*/
