/*
 * Logo Search packages:
 * Sourcecode: unicon version File versions  Download package
 *
 * txt2tab.c
 */

/* $Id$ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pinyin.h>
#include "safestring.h"

/* One Hanzi phrase stored under a pinyin key.  Nodes form a singly linked
   list hanging off KeyPhrase.hzph. */
typedef struct _HzPhrase
{
  u_char hz[MAX_PHRASE_LEN * 2 + 1];    /* phrase bytes (2 per character), NUL-terminated */
  u_char freq;                  /* frequency byte written after the phrase in the output file */
  struct _HzPhrase *next;       /* next phrase with the same key, NULL at the end */
}
HzPhrase;

/* All phrases that share one pinyin key.  Nodes form a singly linked list
   hanging off phtab[], indexed by the pinyin index of the first syllable. */
typedef struct _KeyPhrase
{
  u_char len;                   /* number of syllables (= characters) in the phrase */
  u_char key[2 * MAX_PHRASE_LEN + 1];   /* packed key; only key[0..len] is used:
                                           key[0] collects the 9th bits, key[1..len]
                                           the low 8 bits of each pinyin index */
  int count;                  // number of HzPhrase items in hzph; written to the file as one u_char
  HzPhrase *hzph;               /* head of the phrase list for this key */
  struct _KeyPhrase *next;      /* next key in the same phtab[] bucket, NULL at the end */
}
KeyPhrase, *PKeyPhrase;

/* pytab[c][n]: pinyin syllables whose first letter is 'a' + c, filled by
   LoadTable; each entry carries the syllable text (.py) and its index (.key). */
PinYin pytab[26][MAX_EACH_PY];
/* hztab[k]: Hanzi string read from the map file for pinyin index k. */
u_char hztab[MAX_PY_NUM][MAX_EACH_HZ];
/* phtab[k]: list of key phrases whose first syllable has pinyin index k. */
PKeyPhrase phtab[MAX_PY_NUM];
/* phcount[k]: number of KeyPhrase nodes in phtab[k]; SavePhraseToFile adds
   one more for the single-character entry before writing it out. */
u_short phcount[MAX_PY_NUM];

//fill the hztab and pytab structures
int
LoadTable (char *pathname)
{
  FILE *stream;
  char str[250], *strpy, *strhz;
  int i = 1, j = 0, lastpy = 0, curpy;

  if ((stream = fopen (pathname, "r")) == NULL)
    {
      fprintf (stderr, "%s file not found\n", pathname);
      exit (1);
    }

  while (!feof (stream))
    {
      if (fgets (str, 250, stream) != NULL)
      {
        strpy = strtok (str, " \f\n\r\t\v");
        strhz = strtok (NULL, " \f\n\r\t\v");
        safe_strncpy (hztab[i], strhz, MAX_EACH_HZ);

        curpy = strpy[0] - 'a';
        if (curpy != lastpy)
          j = 0;
        safe_strncpy (pytab[curpy][j].py, strpy, MAX_PY_LEN);
        pytab[curpy][j].key = i;
        lastpy = curpy;
        i++, j++;
      }
    }
  fclose (stream);
  return 0;
}

/*
 * Split strbuf into tokens separated by spaces and tabs.
 *
 * strbuf: NUL-terminated input; it is not modified.
 * len:    size in bytes of one row of strarr.
 * strarr: receives the tokens, one NUL-terminated string per row.  A token
 *         longer than len - 1 bytes is truncated to fit its row.  The caller
 *         must supply enough rows; the row count is not known here.
 *
 * Returns the number of tokens stored.
 */
int
String2Array (char *strbuf, int len, char strarr[][len])
{
  int i = 0, cursor, toklen, count = 0, buflen = (int) strlen (strbuf);

  while (i < buflen)
    {
      while (i < buflen && (strbuf[i] == ' ' || strbuf[i] == '\t'))
        i++;                    /* skip separators */
      cursor = i;
      while (i < buflen && strbuf[i] != ' ' && strbuf[i] != '\t')
        i++;                    /* scan the token */

      if (i > cursor)
        {
          toklen = i - cursor;
          if (toklen > len - 1)
            toklen = len - 1;   /* never write past the row */
          memcpy (strarr[count], strbuf + cursor, (size_t) toklen);
          strarr[count][toklen] = '\0';
          count++;
        }
    }
  return count;
}

/* hzlen[n]: number of key phrases of length n created by SavePhraseToMem;
   printed as statistics by main.
   NOTE(review): indexed by phrase lengths up to MAX_PHRASE_LEN, so the fixed
   size 10 assumes MAX_PHRASE_LEN <= 9 -- confirm against pinyin.h. */
int hzlen[10];
int
SavePhraseToMem (char *str, u_char * key, u_char len, u_char freq)
{
  PKeyPhrase kph, tmpkph;
  HzPhrase *hzph;
  int first;
  short ahead;

  if (len < 2)
    return 0;
  /* single char phrase ignored */
  if (len > MAX_PHRASE_LEN)
    {
      fprintf (stderr, "buffer overrun\n");
      abort ();
    }

  ahead = (short) key[1];
  ahead |= (key[0] & 0x01) << 8;

  kph = phtab[ahead];
  if (kph != NULL)            // first phrase of this pinyin
    {
      first = 1;
      do
      {
        if (first)
          first = 0;
        else
          kph = kph->next;

        /* find the matched pinyin keyphrase */
        if (kph->len == len && !memcmp (kph->key, key, len + 1))
          {
            for (hzph = kph->hzph; hzph != NULL; hzph = hzph->next)
            if (!memcmp (hzph->hz, str, 2 * len))     // same phrase
              {
                fprintf (stderr,
                       "Duplicate phrase %s detected, ignored!\n",
                       hzph->hz);
                return 0;
              }

            hzph = kph->hzph;
            while (hzph->next != NULL)
            hzph = hzph->next;      // reach the end of the link list

            if ((hzph->next = (HzPhrase *) malloc (sizeof (HzPhrase))) ==
              NULL)
            {
              fprintf (stderr, "no enough memory\n");
              exit (1);
            }
            kph->count++;
            hzph = hzph->next;
            hzph->freq = 0;
            hzph->next = NULL;
            memcpy (hzph->hz, str, len * 2);    /* len < MAX_PHRASE_LEN */
            hzph->hz[len * 2] = '\0';
            return 1;         // insert a new Hanzi Phrase at the end of the link list
          }
      }
      while (kph->next != NULL);
    }

  // not found , no matched pinyin keyphrase, allocate a new one
  if ((tmpkph = (KeyPhrase *) malloc (sizeof (KeyPhrase))) == NULL)
    {
      fprintf (stderr, "no enough memory\n");
      exit (1);
    }
  if (phtab[ahead] == NULL)
    phtab[ahead] = tmpkph;
  else
    kph->next = tmpkph;

  tmpkph->len = len;
  tmpkph->count = 1;
  memcpy (tmpkph->key, key, len + 1);     /* len < MAX_PHRASE_LEN */
  tmpkph->next = NULL;

  if ((tmpkph->hzph = (HzPhrase *) malloc (sizeof (HzPhrase))) == NULL)
    {
      fprintf (stderr, "no enough memory\n");
      exit (1);
    }

  tmpkph->hzph->freq = freq;
  tmpkph->hzph->next = NULL;
  memcpy (tmpkph->hzph->hz, str, len * 2);      /* len < MAX_PHRASE_LEN */
  tmpkph->hzph->hz[len * 2] = '\0';
  phcount[ahead]++;

  hzlen[len]++;
  return 1;
}

/* Largest KeyPhrase.count seen while writing the table (SavePhraseToFile). */
int max_count = 0;
/* Running size in bytes of the phrase data, accumulated by SavePhraseToFile
   and appended to the end of the output file. */
int file_size = 0;

/*
 * Write the compiled phrase table to pathname and release the in-memory
 * phrase lists.
 *
 * For every pinyin index j (1 .. MAX_PY_NUM - 1) the file holds:
 *   u_short  number of key entries that follow
 *   [if hztab[j] is not empty] one single-character entry:
 *            len = 1, count, 2 key bytes, then count * (2 Hanzi bytes, freq 0)
 *   for each KeyPhrase in phtab[j]:
 *            len, count, key[len + 1], then count * (2 * len bytes, freq)
 * The accumulated file_size (an int) is appended at the very end.
 *
 * Side effects: updates phcount[], file_size and max_count, frees every
 * node reachable from phtab[] and resets the buckets to NULL.
 *
 * Returns 1.  Exits with status 1 if the file cannot be opened or written,
 * or if a key holds more than 255 phrases.
 */
int
SavePhraseToFile (char *pathname)
{
  FILE *out;
  KeyPhrase *kph, *kphtmp;
  HzPhrase *hzph, *hzphtmp;
  u_char key[MAX_PHRASE_LEN + 1], len, count, freq, size;
  unsigned int j, k, itemcount = 0;

  if ((out = fopen (pathname, "wb")) == NULL)
    {
      fprintf (stderr, "%s cant open.\n", pathname);
      exit (1);
    }

  for (j = 1; j < MAX_PY_NUM; j++)
    {
      file_size += 2;           /* the u_short entry counter */

      /* NOTE(review): count is one byte, so more than 255 characters for
         one pinyin would be truncated here -- confirm MAX_EACH_HZ. */
      count = strlen ((char *) hztab[j]) / 2;
      if (count > 0)
        {
          /* The single characters count as one extra key entry. */
          phcount[j]++;
          fwrite (&(phcount[j]), sizeof (phcount[j]), 1, out);

          len = 1;
          fwrite (&len, sizeof (len), 1, out);
          key[0] = j >> 8;
          key[1] = j & 0xFF;
          fwrite (&count, sizeof (count), 1, out);
          fwrite (key, sizeof (char), 2, out);

          freq = 0;
          for (k = 0; k < count; k++)
            {
              fwrite (&(hztab[j][k * 2]), sizeof (char), 2, out);
              fwrite (&freq, sizeof (freq), 1, out);
            }
          file_size += SizeOfPhrase (1, count);
        }
      else
        fwrite (&(phcount[j]), sizeof (phcount[j]), 1, out);

      kph = phtab[j];
      phtab[j] = NULL;          /* the list is freed below */
      while (kph != NULL)
        {
          kphtmp = kph;
          kph = kph->next;

          len = kphtmp->len;
          if (len > MAX_PHRASE_LEN)
            {
              fprintf (stderr, "buffer overrun\n");
              abort ();
            }
          if (kphtmp->count > 255)
            {
              fprintf (stderr, "Phrase Count = %d > 255, error!!!\n",
                       kphtmp->count);
              exit (1);
            }
          if (kphtmp->count > max_count)
            max_count = kphtmp->count;

          /* len, count, key[len+1], then phrase, freq, phrase, freq ... */
          size = (u_char) kphtmp->count;
          memcpy (key, kphtmp->key, len + 1);
          fwrite (&len, sizeof (char), 1, out);
          fwrite (&size, sizeof (size), 1, out);
          fwrite (key, sizeof (char), len + 1, out);

          file_size += SizeOfPhrase (len, kphtmp->count);

          hzph = kphtmp->hzph;
          while (hzph != NULL)
            {
              hzphtmp = hzph;
              hzph = hzph->next;

              itemcount++;
              fwrite (hzphtmp->hz, sizeof (char), len * 2, out);
              fwrite (&(hzphtmp->freq), sizeof (hzphtmp->freq), 1, out);
              free (hzphtmp);
            }
          free (kphtmp);
        }
    }

  fwrite (&file_size, sizeof (file_size), 1, out);

  /* The stream's error flag is sticky, so one check covers every fwrite;
     fclose reports failures of the final flush. */
  if (ferror (out) || fclose (out) != 0)
    {
      fprintf (stderr, "%s write error.\n", pathname);
      exit (1);
    }

  /* size_t and unsigned arguments need matching conversions. */
  printf ("FileSize=%lu\tTotalItem=%u\n\n",
          (unsigned long) (file_size + sizeof (int)), itemcount);
  return 1;
}

int
LoadPhraseFromFile (char *pathname)
{
  FILE *stream;
  int i, j;
  char str[250];
  u_char len;
  u_char key[MAX_PHRASE_LEN + 1];
  unsigned short pykey[MAX_PHRASE_LEN];
  int count, ahead, flag = 0, freq;
  char strarr[MAX_PHRASE_LEN + 4][2 * MAX_PHRASE_LEN + 1];

  if ((stream = fopen (pathname, "r")) == NULL)
    {
      fprintf (stderr, "Couldn't open %s.\n", pathname);
      exit (1);
    }

  while (!feof (stream))
    {
      if (fgets (str, 250, stream) != NULL)
      {
        str[strlen (str) - 1] = '\0';
        count = String2Array (str, 2 * MAX_PHRASE_LEN + 1, strarr);
        len = strlen (strarr[0]) / 2;
        /* len+1 = count, freq = 0
           len+2 = count, freq = xx */
        if ((len != count - 1 && len != count - 2) || len > MAX_PHRASE_LEN)
          {
            fprintf (stderr, "Phrase %s error!!!\n", str);
            continue;
          }

        if (len == count - 2)
          {
            freq = atoi (strarr[count - 1]);
            if (freq > 255)
            freq = 255;
            count--;
          }
        else
          freq = 0;

        for (i = 1; i < count; i++)
          {
            ahead = (int) strarr[i][0] - 'a';
            flag = 0;
            if (ahead < 0 || ahead > 25)
            {
              fprintf (stderr, "Phrase %s error!!!\n", str);
              break;
            }

            for (j = 0; pytab[ahead][j].key; j++)
            {
              if (!strcmp (pytab[ahead][j].py, strarr[i]))
                {
                  pykey[i - 1] = pytab[ahead][j].key;
                  flag = 1;
                  break;
                }
            }
            if (!flag)
            break;
          }             // for

        if (!flag)
          {
            fprintf (stderr, "Phrase %s error!!!\n", str);
            continue;
          }
        for (i = 0; i < len; i++)
          key[i + 1] = pykey[i] & 0xff;

        key[0] = '\0';
        for (i = 0; i < len; i++)
          key[0] |= (pykey[i] & 0x0100) >> (8 - i);

        /*
           printf("%s, len=%d, key0=%d, key1 =%d, key=%d\n",
           str,len,(int)key[0],(int)key[1],(int)key[2]);
         */

        SavePhraseToMem (str, key, len, freq);
      }
    }

  fclose (stream);
  return (0);
}

/*
 * Entry point: txt2tab <input_name> <output_name>
 *
 * Loads pinyin.map from the first readable candidate location, compiles the
 * text phrase list <input_name> into the binary table <output_name> and
 * prints per-length statistics.
 *
 * Returns 0 on success, 1 on a usage error or when pinyin.map is not found.
 */
int
main (int argc, char **argv)
{
  /* Candidate locations of pinyin.map, tried in this order. */
  static const char *const map_paths[] = {
    "./pinyin.map",
    "/usr/lib/unicon/modules/cce/dict/pinyin.map",
    "/usr/local/lib/unicon/modules/cce/dict/pinyin.map"
  };
  const char *map = NULL;
  size_t n;
  int i, total = 0;

  if (argc != 3)
    {
      /* No option is parsed, so none is advertised. */
      fprintf (stderr, "usage: %s <input_name> <output_name>\n", argv[0]);
      return 1;
    }

  for (i = 0; i < MAX_PY_NUM; i++)
    {
      phtab[i] = NULL;
      phcount[i] = 0;
    }

  for (n = 0; n < sizeof map_paths / sizeof map_paths[0]; n++)
    if (access (map_paths[n], R_OK) == 0)
      {
        map = map_paths[n];
        break;
      }
  if (map == NULL)
    {
      fprintf (stderr, "Sorry, couldn't find pinyin.map!\n");
      return 1;
    }
  LoadTable ((char *) map);     /* LoadTable only reads the path */

  LoadPhraseFromFile (argv[1]);
  SavePhraseToFile (argv[2]);

  /* Statistics; never read past the end of hzlen. */
  for (i = 2;
       i <= MAX_PHRASE_LEN && i < (int) (sizeof hzlen / sizeof hzlen[0]);
       i++)
    {
      printf ("Length=%d\tCount=%d\n", i, hzlen[i]);
      total += hzlen[i];
    }
  printf ("\nTotalPhrases = %d, MaxPhrasePerPinyin=%d\n\n", total, max_count);

  return 0;
}

/* Generated by  Doxygen 1.6.0   Back to index */