/*
 * unicode.c -- source file from the zh-autoconvert package.
 * (Navigation text of the web source listing this copy was taken from
 * has been folded into this comment.)
 */

/*
AutoConvert, a Chinese HZ/GB/Big5 encodings auto-converter
Copyright (C) 1999  于广辉  Yu Guanghui <ygh@dlut.edu.cn>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA  02111-1307, USA.
*/
/*
 * Adapted from hztty-2.0 (copyright 1992 Yongguang Zhang)
 * by Yu Guanghui <ygh@dlut.edu.cn>, 1999.10
 */
//$Header: /home/prog/autoconvert/hzconvert/unicode.c,v 1.5 2001/04/28 01:45:46 ygh Exp $

#include "hz.h"

/*** UTF8 => UNICODE ***/

/*
 * Decode the UTF-8 sequence that starts at p[0] into one two-byte
 * Unicode value: the high byte goes to *pu1, the low byte to *pu2.
 *
 *   p    - input bytes; only bytes covered by len are examined
 *   len  - number of bytes available at p (callers pass at least 1)
 *
 * Returns
 *    1, 2 or 3   number of input bytes consumed by a decoded character;
 *    0           the sequence is cut short by len ("dangling");
 *   -1           p[0] is a continuation byte with no lead byte;
 *   -2 / -3      the 2nd / 3rd byte is not a continuation byte;
 *   -4           p[0] is not the lead byte of a 1-3 byte sequence.
 * Nothing is stored through pu1/pu2 unless the result is positive.
 */
static int u8toun(p, len, pu1, pu2)
    unsigned char *p;
    int len;
    unsigned char *pu1, *pu2;
{
  unsigned char c1 = p[0];
  unsigned char c2, c3;

      if ((c1 & 0x80) == 0) { /* ASCII character */
            *pu1 = 0;
            *pu2 = c1;
            return 1;
      }
      if ((c1 & 0xc0) == 0x80) {    /* unexpected tail character */
            return -1;
      }
      if ((c1 & 0xe0) == 0xc0) {    /* 110vvvvv 10vvvvvv */
            if (len < 2)      /* at least two characters are expected */
                  return 0;   /* dangling */
            /* p[1] is only read once len guarantees it exists */
            c2 = p[1];
            if ((c2 & 0xc0) != 0x80)  /* unexpected head character */
                  return -2;
            *pu1 = (c1 & 0x1c) >> 2;
            *pu2 = ((c1 & 0x03) << 6) | (c2 & 0x3f);
            return 2;
      }
      if ((c1 & 0xf0) == 0xe0) {    /* 1110vvvv 10vvvvvv 10vvvvvv */
            if (len < 3)      /* at least three characters are expected */
                  return 0;   /* dangling */
            /* p[1] and p[2] are only read once len guarantees they exist */
            c2 = p[1];
            c3 = p[2];
            if ((c2 & 0xc0) != 0x80)  /* unexpected head character */
                  return -2;
            if ((c3 & 0xc0) != 0x80)  /* unexpected head character */
                  return -3;
            *pu1 = ((c1 & 0x0f) << 4) | ((c2 & 0x3c) >> 2);
            *pu2 = ((c2 & 0x03) << 6) | (c3 & 0x3f);
            return 3;
      }
      return -4;
}

/*
 * Per-instance carry-over for utf8_uni(): up to two bytes of an incomplete
 * UTF-8 sequence held back at the end of one call and re-queued at the start
 * of the next.  A zero byte marks an empty slot.
 */
static char utf8_uni_savec[MAX_MODULE][2];

/*
 * Reset the UTF-8 => Unicode converter: empty the carry-over bytes of
 * every instance slot.  `arg` is not used.
 *
 * An earlier revision (previously kept here as disabled code) handed out
 * one new instance number per call; this one clears all MAX_MODULE slots
 * and returns the slot counter after the sweep, i.e. MAX_MODULE.
 * NOTE(review): that value is one past the last valid slot index --
 * confirm callers do not pass it on as `inst`.
 */
int utf8_uni_init (arg)
     char *arg;
{
      int slot = 0;

      while (slot < MAX_MODULE) {
            utf8_uni_savec[slot][0] = '\0';
            utf8_uni_savec[slot][1] = '\0';
            slot++;
      }
      return slot;
}

/* Low byte written (after a zero high byte) in place of undecodable UTF-8. */
#define CODE_ERROR      0x80

/*
 * Convert a chunk of UTF-8 text to two-byte Unicode values (high byte
 * first), writing the result back over the caller's buffer.
 *
 *   s     - in: *plen bytes of UTF-8;  out: the converted bytes.
 *           The output can be up to twice as long as the input, so the
 *           buffer behind s must have room for that.
 *   plen  - in: input length;  out: number of bytes written to s.
 *   inst  - index of the instance slot holding the carry-over state.
 *
 * Returns s.  Undecodable input is replaced by the pair (0, CODE_ERROR).
 * An incomplete sequence at the end of the chunk is kept in
 * utf8_uni_savec[inst] and put in front of the next chunk.
 *
 * The carried bytes are placed directly into the local work buffer; the
 * memory in front of s is never touched.  Input that would not fit the
 * work buffer is truncated to its capacity instead of overrunning it.
 */
char *utf8_uni (s,plen,inst)
     char *s;
     int *plen;
     int inst;
{
  char buf[MAX_BUFFER*3];
  char *s_start = s;
  register int len = *plen;
  register char *p = buf;
  int carried = 0;      /* bytes re-queued from the previous call */

      if (len <= 0)
                return (s);

      /* first saved byte goes first, then the second one, if any */
      if (utf8_uni_savec[inst][0]) {
            buf[carried++] = utf8_uni_savec[inst][0];
            utf8_uni_savec[inst][0] = 0;
            if (utf8_uni_savec[inst][1]) {
                  buf[carried++] = utf8_uni_savec[inst][1];
                  utf8_uni_savec[inst][1] = 0;
            }
      }

      /* never copy more than the work buffer can hold */
      if (len > (int)sizeof(buf) - carried)
            len = (int)sizeof(buf) - carried;
      memcpy(buf + carried, s, len);
      len += carried;

      while (len > 0) {
            int nc = u8toun((unsigned char *)p, len,
                        (unsigned char *)s, (unsigned char *)(s + 1));

            if (nc > 0) {
                  s += 2;
                  p += nc;
                  len -= nc;
            } else if (nc < 0) {    /* bad character */
                  int skip = -nc;

                  /* do not step past the end of the data */
                  if (skip > len)
                        skip = len;
                  *s++ = 0;
                  *s++ = (char)CODE_ERROR;
                  p += skip;
                  len -= skip;
            } else {    /* dangling: one or two bytes are left over */
                  utf8_uni_savec[inst][0] = *p++;
                  len--;
                  if (len == 1)
                        utf8_uni_savec[inst][1] = *p++;
                  break;
            }
      }
      *plen = s - s_start;
      return s_start;
}

/*** UNICODE => UTF8 ***/

/*
 * Per-instance carry-over for uni_utf8(): when a chunk ends in the middle
 * of a two-byte value, the odd byte is kept in uni_utf8_savec[] and
 * uni_utf8_saved[] is set to 1 (a separate flag, since the byte itself
 * may be zero).
 */
static char uni_utf8_savec[MAX_MODULE];
static char uni_utf8_saved[MAX_MODULE];

/*
 * Reset the Unicode => UTF-8 converter: mark the carry-over byte of every
 * instance slot as empty.  `arg` is not used.
 *
 * An earlier revision (previously kept here as disabled code) handed out
 * one new instance number per call; this one clears all MAX_MODULE slots
 * and returns the slot counter after the sweep, i.e. MAX_MODULE.
 * NOTE(review): that value is one past the last valid slot index --
 * confirm callers do not pass it on as `inst`.
 */
int uni_utf8_init (arg)
     char *arg;
{
      int slot = 0;

      while (slot < MAX_MODULE) {
            uni_utf8_saved[slot] = 0;
            slot++;
      }
      return slot;
}

/*
 * Convert a chunk of two-byte Unicode values (high byte first) to UTF-8,
 * writing the result back over the caller's buffer.
 *
 *   s     - in: *plen bytes of input;  out: the UTF-8 bytes.
 *           Each two-byte value becomes one to three bytes, so the buffer
 *           behind s must have room for up to 1.5 times the input.
 *   plen  - in: input length;  out: number of bytes written to s.
 *   inst  - index of the instance slot holding the carry-over state.
 *
 * Returns s.  If the chunk ends on an odd byte, that byte is kept in
 * uni_utf8_savec[inst] and put in front of the next chunk.
 *
 * The carried byte is placed directly into the local work buffer; the
 * memory in front of s is never touched.  Input that would not fit the
 * work buffer is truncated to its capacity instead of overrunning it.
 */
char *uni_utf8 (s,plen,inst)
     char *s;
     int *plen;
     int inst;
{
  char buf[MAX_BUFFER*3];
  char *s_start = s;
  register int len = *plen;
  /* bytes are examined as unsigned so the bit tests do not depend on
     whether plain char is signed */
  register unsigned char *p = (unsigned char *)buf;
  int carried = 0;      /* 1 if a byte was re-queued from the last call */

      if (len <= 0)
                return (s);
      if (uni_utf8_saved[inst]) {
            buf[carried++] = uni_utf8_savec[inst];
            uni_utf8_saved[inst] = 0;
      }

      /* never copy more than the work buffer can hold */
      if (len > (int)sizeof(buf) - carried)
            len = (int)sizeof(buf) - carried;
      memcpy(buf + carried, s, len);
      len += carried;

      while (len >= 2) {
            unsigned int hi = p[0];
            unsigned int lo = p[1];

            if ((hi == 0) && ((lo & 0x80) == 0)) {    /* ASCII */
                  *s++ = (char)lo;
            } else if ((hi & 0xf8) == 0) {      /* 0x0080 - 0x07ff */
                  *s++ = (char)(0xc0 | (hi << 2) | ((lo >> 6) & 0x03));
                  *s++ = (char)(0x80 | (lo & 0x3f));
            } else {                      /* 0x0800 - 0xffff */
                  *s++ = (char)(0xe0 | ((hi >> 4) & 0x0f));
                  *s++ = (char)(0x80 | ((hi & 0x0f) << 2) |
                              ((lo >> 6) & 0x03));
                  *s++ = (char)(0x80 | (lo & 0x3f));
            }
            p += 2;
            len -= 2;
      }
      if (len == 1) {         /* dangling */
            uni_utf8_saved[inst] = 1;
            uni_utf8_savec[inst] = (char)*p;
      }
      *plen = s - s_start;
      return s_start;
}

/*********** UTF 7 **************/

/* ASCII subsets */

/* The 64 base64 digits in value order: base64[v] is the digit for value v. */
static const unsigned char    base64[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

#define     SAFE_CHARS  "'(),-.:?"
#define     SPACE_CHARS " \t\n\r"

/* character classes (bit flags combined in char_type[]) */
#define     BASE64            0x01
#define     SAFE        0x02
#define     SPACE       0x04

/* base64 state */
#define     IN_ASCII    0
#define     IN_BASE64   1
#define AFTER_PLUS      2

/* inv_base64[c]: 6-bit value of base64 digit c (filled in by
   init_utf7_tables; entries for other characters stay 0) */
static      char  inv_base64[128];
/* char_type[c]: OR of the class flags above that apply to byte c */
static      char  char_type[256];

/*
 * Fill inv_base64[] and char_type[] from the character sets above.
 * Only the first call does any work; later calls return immediately.
 */
static void init_utf7_tables(void)
{
  static int first_time = 1;
  const unsigned char *s;

      if (! first_time)
            return;

      for (s = base64; *s != '\0'; s++) {
            char_type[*s] |= BASE64;
            inv_base64[*s] = (char)(s - base64);
      }
      /* the string literals are plain char; view them as unsigned bytes
         so they can index the tables */
      for (s = (const unsigned char *)SAFE_CHARS;  *s != '\0';  s++)
            char_type[*s] |= SAFE;
      for (s = (const unsigned char *)SPACE_CHARS;  *s != '\0';  s++)
            char_type[*s] |= SPACE;

      first_time = 0;
}

/* State of one UTF-7 encoder or decoder stream, kept between calls. */
struct utf7_context {
    short int state;          /* IN_ASCII, AFTER_PLUS or IN_BASE64 */
    short int nbits;          /* number of bits in the bit buffer */
    unsigned long bit_buffer; /* base64 bits shifted in at the low end;
                                 only the low nbits are still pending */
};

/*
 * Feed one byte of a UTF-7 stream to the decoder state in *ctx.
 *
 * Returns 1 when a two-byte Unicode value was produced (high byte stored
 * in *pu1, low byte in *pu2) and 0 when the byte was absorbed without
 * producing output (a shift character or an incomplete base64 group).
 */
static int u7toun(c, pu1, pu2, ctx) /* return 1 if a unicode is produced */
    unsigned char c;          /* a char in utf7 stream */
    unsigned char *pu1, *pu2; /* the unicode */
    struct utf7_context *ctx; /* the context */
{
      if (ctx->state == IN_ASCII) {
            if (c == '+') {
                  ctx->state = AFTER_PLUS;
                  return 0;
            } else {
                  *pu1 = 0;  *pu2 = c;
                  return 1;
            }
      }
      if (ctx->state == AFTER_PLUS) {
            if (c == '-') {
                  /* "+-" stands for a literal '+'.  Return to direct
                     mode; otherwise the bytes that follow would be
                     taken for base64 and "+--" would give "++". */
                  ctx->state = IN_ASCII;
                  *pu1 = 0;  *pu2 = '+';
                  return 1;
            } else {
                  ctx->state = IN_BASE64;
                  ctx->nbits = 0;
                  /* don't return yet, continue to the IN_BASE64 mode */
            }
      }

      /* now we're in Base64 mode */
      if (char_type[c]&BASE64) {
            /* append 6 more bits; emit a value once 16 are available */
            ctx->bit_buffer <<= 6;
            ctx->bit_buffer |= inv_base64[c];
            ctx->nbits += 6;
            if (ctx->nbits >= 16) {
                  ctx->nbits -= 16;
                  *pu1 = (unsigned char)((ctx->bit_buffer >> (ctx->nbits + 8))
                              & 0x00ff);
                  *pu2 = (unsigned char)((ctx->bit_buffer >>  ctx->nbits     )
                              & 0x00ff);
                  return 1;
            }
            return 0;
      }

      /* any non-base64 byte ends the base64 run; a '-' terminator is
         swallowed, every other byte is passed through as itself */
      ctx->state = IN_ASCII;
      if (c != '-') {
            *pu1 = 0;  *pu2 = c;
            return 1;
      }
      return 0;
}

/*
 * Encode one two-byte Unicode value (u1 = high byte, u2 = low byte) as
 * UTF-7, appending the bytes at s and updating the encoder state in *ctx.
 *
 * ASCII characters whose char_type[] entry has the BASE64, SAFE or SPACE
 * flag are written directly; everything else is written as base64 digits.
 * Returns the number of bytes stored at s.
 */
static int untou7(u1, u2, s, ctx)   /* return the # of char written to s */
    unsigned char u1, u2;     /* the unicode */
    unsigned char *s;         /* store here */
    struct utf7_context *ctx; /* the context */
{
  unsigned char *out = s;
  int direct = (u1 == 0) && ((u2 & 0x80) == 0) &&
               ((char_type[u2] & (BASE64|SAFE|SPACE)) != 0);

      if (!direct) {
            /* base64 path: open a shifted run if none is active */
            if (ctx->state == IN_ASCII) {
                  *out++ = '+';
                  ctx->state = IN_BASE64;
                  ctx->nbits = 0;
            }
            /* queue the 16 new bits, then drain whole 6-bit digits */
            ctx->bit_buffer = (ctx->bit_buffer << 8) | u1;
            ctx->bit_buffer = (ctx->bit_buffer << 8) | u2;
            for (ctx->nbits += 16; ctx->nbits >= 6; ) {
                  ctx->nbits -= 6;
                  *out++ = base64[(ctx->bit_buffer >> ctx->nbits) & 0x3f];
            }
            return (out - s);
      }

      /* direct path: close an active base64 run first */
      if (ctx->state == IN_BASE64) {
            if (ctx->nbits > 0) {
                  /* flush the pending bits as one last digit, padded
                     with zero bits on the right */
                  *out++ = base64[
                      (ctx->bit_buffer << (6 - ctx->nbits)) & 0x3f
                  ];
            }
            /* an explicit '-' is written when the next character is
               itself a base64 digit or a '-' */
            if (u2 == '-' || (char_type[u2] & BASE64)) {
                  *out++ = '-';
            }
            ctx->state = IN_ASCII;
      }
      *out++ = u2;
      if (u2 == '+') {
            /* a literal '+' is written as "+-" */
            *out++ = '-';
      }
      return (out - s);       /* return the length */
}

/***** UTF7 => UNICODE *****/

/* Decoder state of utf7_uni(), one entry per instance slot. */
static struct utf7_context utf7_uni_context[MAX_MODULE];

int utf7_uni_init (arg)
     char *arg;
{
/*
  static int utf7_uni_inst = 0;

      utf7_uni_context[utf7_uni_inst].state = IN_ASCII;
      utf7_uni_context[utf7_uni_inst].nbits = 0;
      utf7_uni_context[utf7_uni_inst].bit_buffer = 0L;
      return (utf7_uni_inst++);
*/

      int i;
      init_utf7_tables();
        for(i=0;i<MAX_MODULE;i++){
            utf7_uni_context[i].state = IN_ASCII;
            utf7_uni_context[i].nbits = 0;
            utf7_uni_context[i].bit_buffer = 0L;
      }
      return (i++);
}

/*
 * Convert a chunk of UTF-7 text to two-byte Unicode values (high byte
 * first), writing the result back over the caller's buffer.
 *
 *   s     - in: *plen bytes of UTF-7;  out: the converted bytes.
 *           The output can be up to twice as long as the input, so the
 *           buffer behind s must have room for that.
 *   plen  - in: input length;  out: number of bytes written to s.
 *   inst  - index of the instance slot holding the decoder state.
 *
 * Returns s.  Decoder state (shift mode, pending bits) is kept in
 * utf7_uni_context[inst] between calls.  Input that would not fit the
 * local work buffer is truncated to its capacity instead of overrunning it.
 */
char *utf7_uni (s,plen,inst)
     char *s;
     int *plen;
     int inst;
{
  char buf[MAX_BUFFER*3];
  char *s_start = s;
  register int len = *plen;
  register char *p = buf;

      if (len <= 0)
                return (s);

      /* never copy more than the work buffer can hold */
      if (len > (int)sizeof(buf))
            len = (int)sizeof(buf);
      memcpy(p,s, len);

      while (len-- > 0) {
            /* the output pair is written at s only when the decoder
               reports that a value was produced */
            if (u7toun((unsigned char)*p++,
                       (unsigned char *)s, (unsigned char *)(s + 1),
                       &(utf7_uni_context[inst]))) {
                  s += 2;
            }
      }
      *plen = s - s_start;
      return s_start;
}

/***** UNICODE => UTF7 *****/

/*
 * Per-instance state of uni_utf7(): an odd trailing input byte is kept in
 * uni_utf7_savec[] with uni_utf7_saved[] set to 1, and the encoder state
 * lives in uni_utf7_context[].
 */
static char uni_utf7_savec[MAX_MODULE];
static char uni_utf7_saved[MAX_MODULE];
static struct utf7_context uni_utf7_context[MAX_MODULE];

int uni_utf7_init (arg)
     char *arg;
{
/*
  static int uni_utf7_inst = 0;

      uni_utf7_saved[uni_utf7_inst] = 0;
      uni_utf7_context[uni_utf7_inst].state = IN_ASCII;
      uni_utf7_context[uni_utf7_inst].nbits = 0;
      uni_utf7_context[uni_utf7_inst].bit_buffer = 0L;
      return (uni_utf7_inst++);
*/
      int i;
      init_utf7_tables();
        for(i=0;i<MAX_MODULE;i++){
            uni_utf7_saved[i] = 0;
            uni_utf7_context[i].state = IN_ASCII;
            uni_utf7_context[i].nbits = 0;
            uni_utf7_context[i].bit_buffer = 0L;
      }
      return (i++);
}

/*
 * Convert a chunk of two-byte Unicode values (high byte first) to UTF-7,
 * writing the result back over the caller's buffer.
 *
 *   s     - in: *plen bytes of input;  out: the UTF-7 bytes.
 *           The output can be longer than the input, so the buffer
 *           behind s must have room for the expansion.
 *   plen  - in: input length;  out: number of bytes written to s.
 *   inst  - index of the instance slot holding the encoder state.
 *
 * Returns s.  If the chunk ends on an odd byte, that byte is kept in
 * uni_utf7_savec[inst] and put in front of the next chunk; the encoder
 * state is kept in uni_utf7_context[inst].
 *
 * The carried byte is placed directly into the local work buffer; the
 * memory in front of s is never touched.  Input that would not fit the
 * work buffer is truncated to its capacity instead of overrunning it.
 */
char *uni_utf7 (s,plen,inst)
     char *s;
     int *plen;
     int inst;
{
  char buf[MAX_BUFFER*3];
  char *s_start = s;
  register int len = *plen;
  register unsigned char *p = (unsigned char *)buf;
  int carried = 0;      /* 1 if a byte was re-queued from the last call */

      if (len <= 0)
                return (s);
      if (uni_utf7_saved[inst]) {
            buf[carried++] = uni_utf7_savec[inst];
            uni_utf7_saved[inst] = 0;
      }

      /* never copy more than the work buffer can hold */
      if (len > (int)sizeof(buf) - carried)
            len = (int)sizeof(buf) - carried;
      memcpy(buf + carried, s, len);
      len += carried;

      while (len >= 2) {
            s += untou7(p[0], p[1], (unsigned char *)s,
                        &(uni_utf7_context[inst]));
            len -= 2;
            p += 2;
      }
      if (len == 1) {         /* dangling */
            uni_utf7_saved[inst] = 1;
            uni_utf7_savec[inst] = (char)*p;
      }
      *plen = s - s_start;
      return s_start;
}


/* Source listing generated by Doxygen 1.6.0. */