构建GB2312汉字库的unicode码表

构建GB2312汉字库的unicode码表　2010-12-22　vckbase　乾坤一笑

构建 GB2312 汉字库的 unicode 码表

嵌入式系统总离不了处理汉字。一般汉字的处理方法是（以手机接收短信为例）：比如你收到了一封短信，该短信解码后是按照 UTF-16 表示的，那么我们需要根据每一个汉字的 unicode 码找到它在 GB2312 库中的位置，然后再用对应的点阵数据在屏幕上显示出来。

于是乎，必须有一种手段将 unicode 码和汉字字模的数据对应起来。最常用的手段是做一个 unicode 码表（一个数组）：在该数组中查找到匹配的 unicode 码后，用匹配项的 index（数组索引）值，到另外一个按相同 index 存放字模记录的数组中取出对应的点阵数据去显示。

+-----------------+ 查表 +-----------------+ 同index +-------------------+
|   unicode 码    | ---> |  unicode 码表   | ------> |   汉字字模数组    |
+-----------------+      +-----------------+         +-------------------+

本文简要介绍一下如何生成 unicode 码表，其它相关的汉字处理技术不在本文的讨论范围之内。:）

用下面两个函数可以把 unicode 码表构造出来（*注1）：

void UnicodeToGB2312（unsigned char* pOut,unsigned short uData） 
{
   WideCharToMultiByte（CP_ACP,NULL,&uData,1,pOut,sizeof （unsigned short）,NULL,NULL）;
   return;
}
void Gb2312ToUnicode（unsigned short* pOut,unsigned char *gbBuffer）
{
   MultiByteToWideChar （CP_ACP,MB_PRECOMPOSED,gbBuffer,2,pOut,1）;
   return;
}

一个简单的例子如下（随手写的一段代码，只是演示一下构造数组的过程，不要挑刺儿啊！ ^_^ ）：

/*-----------------------------------------------*|GB2312 unicode table constructor ||author: Spark Song ||file: build_uni_table.c||date: 2005-11-18 |*-----------------------------------------------*/#include <stdio.h>#include <windows.h>void UnicodeToGB2312（unsigned char* pOut,unsigned short uData）;void Gb2312ToUnicode（unsigned short* pOut,unsigned char *gbBuffer）;void construct_unicode_table（）;int main（int argc, char *argv[]）{construct_unicode_table（）;return 0;}void construct_unicode_table（）{#define GB2312_MATRIX （94）#define DELTA （0xA0）#define FONT_ROW_BEGIN （16+ DELTA）#define FONT_ROW_END （87 + DELTA）#define FONT_COL_BEGIN （1+ DELTA）#define FONT_COL_END （GB2312_MATRIX + DELTA）#define FONT_TOTAL （72 * GB2312_MATRIX）int i, j;unsigned char chr[2];unsigned shortuni;unsigned shortdata[FONT_TOTAL] = {0};int index = 0;unsigned short buf;//生成unicode码表for （i=FONT_ROW_BEGIN; i<=FONT_ROW_END; i++）for（j=FONT_COL_BEGIN; j<=FONT_COL_END; j++）{chr[0] = i;chr[1] = j;Gb2312ToUnicode（&uni, chr）;data[index] = uni; index++;} //排个序，以后检索的时候就可以用binary-search了for （i=0;i<index-1; i++）for（j=i+1; j<index; j++）if （data[i]>data[j]）{buf = data[i];data[i] = data[j];data[j] = buf;}//输出到STD_OUTprintf（"const unsigned short uni_table[]={
"）;for （i=0; i<index; i++）{uni = data[i];UnicodeToGB2312（chr, uni）;printf（"0x%.4X%s /* GB2312 Code: 0x%.2X%.2X ==> Row:%.2d Col:%.2d */
",uni,i==index-1？" ":",",chr[0],chr[1],chr[0] - DELTA,chr[1] - DELTA）;}printf（"};
"）;return ;}void UnicodeToGB2312（unsigned char* pOut,unsigned short uData）{WideCharToMultiByte（CP_ACP,NULL,&uData,1,pOut,sizeof（unsigned short）,NULL,NULL）;return;}void Gb2312ToUnicode（unsigned short* pOut,unsigned char *gbBuffer）{MultiByteToWideChar（CP_ACP,MB_PRECOMPOSED,gbBuffer,2,pOut,1）;return;}

用 VC 编译后，在 DOS 中执行：

build_uni_table.exe > report.txt

可以得到如下的txt文件：

const unsigned short uni_table[]={
  0x4E00, /* GB2312 Code: 0xD2BB ==> Row:50 Col:27 */
  0x4E01, /* GB2312 Code: 0xB6A1 ==> Row:22 Col:01 */
  0x4E03, /* GB2312 Code: 0xC6DF ==> Row:38 Col:63 */
  0x4E07, /* GB2312 Code: 0xCDF2 ==> Row:45 Col:82 */
... ...
  0x9F9F, /* GB2312 Code: 0xB9EA ==> Row:25 Col:74 */
  0x9FA0, /* GB2312 Code: 0xD9DF ==> Row:57 Col:63 */
  0xE810, /* GB2312 Code: 0xD7FA ==> Row:55 Col:90 */
  0xE811, /* GB2312 Code: 0xD7FB ==> Row:55 Col:91 */
  0xE812, /* GB2312 Code: 0xD7FC ==> Row:55 Col:92 */
  0xE813, /* GB2312 Code: 0xD7FD ==> Row:55 Col:93 */
  0xE814 /* GB2312 Code: 0xD7FE ==> Row:55 Col:94 */
};

然后把这个生成的数组 copy 到项目代码中使用就 OK 了。hoho，其实在开发中编写代码来构造代码的机会很多，coder 不用 coding 辅助自己开发多浪费啊～ :）