简介
这几天一直在研究中文的简体和繁体之间的转换问题,网上查了一下资料,在此进行整理和备份。
繁体中文常见的有GBK码和BIG5码两种编码(GBK是GB2312的超集,同时收录了简体字和繁体字),简体中文一般使用的是GB2312编码。
这些编码之间的转换基本都是使用下列3个函数:LCMapString、WideCharToMultiByte和MultiByteToWideChar,其中还会牵涉到UNICODE码和UTF-8码这两种编码。
GB2312编码与GBK编码可以直接使用LCMapString转换,GB2312编码/GBK编码与BIG5编码则无法直接转换,必须使用UNICODE作为中间编码进行中转。
另外UTF-8编码是网络常用编码,如XML文件和网页基本都是使用这种编码,所以在此也一并研究了一下。
下面是我将GB2312/GBK/BIG5/UNICODE/UTF-8这5种编码之间的转换编写到一个函数的代码。
代码
1 int Convert(void *sstr, int scp, void **dstr, int dcp)
2 {
3 #define CP_GBK 936
4 #define CP_BIG5 950
5 #define CP_UTF8 65001
6
7 enum { _unicode, _utf8, _gb2312, _gbk, _big5 };
8 enum { _wc2mb, _mb2wc, _sc2tc, _tc2sc };
9
10 LCID lcid;
11 void *src;
12 void *dest;
13 int cch;
14 int scp0;
15 int act;
16 UINT cp;
17
18 if (((scp < _unicode) || (scp > _big5)) ||
19 ((dcp < _unicode) || (dcp > _big5)))
20 return -1;
21
22 src = NULL;
23 dest = sstr;
24 cch = 0;
25 scp0 = scp;
26
27 while (scp != dcp)
28 {
29 src = dest;
30 switch (scp)
31 {
32 case _unicode:
33 switch (dcp)
34 {
35 case _utf8:
36 scp = _utf8;
37 act = _wc2mb;
38 cp = CP_UTF8;
39 break;
40 case _gb2312:
41 scp = ((scp0 == _big5) ? _gbk : _gb2312);
42 act = _wc2mb;
43 cp = CP_GBK;
44 break;
45 case _gbk:
46 scp = _gbk;
47 act = _wc2mb;
48 cp = CP_GBK;
49 break;
50 case _big5:
51 scp = _big5;
52 act = _wc2mb;
53 cp = CP_BIG5;
54 break;
55 }
56 break;
57 case _utf8:
58 switch (dcp)
59 {
60 case _unicode:
61 case _gb2312:
62 case _gbk:
63 case _big5:
64 scp = _unicode;
65 act = _mb2wc;
66 cp = CP_UTF8;
67 break;
68 }
69 break;
70 case _gb2312:
71 switch (dcp)
72 {
73 case _unicode:
74 case _utf8:
75 scp = _unicode;
76 act = _mb2wc;
77 cp = CP_GBK;
78 break;
79 case _gbk:
80 case _big5:
81 scp = _gbk;
82 act = _sc2tc;
83 break;
84 }
85 break;
86 case _gbk:
87 switch (dcp)
88 {
89 case _unicode:
90 case _utf8:
91 case _big5:
92 scp = _unicode;
93 act = _mb2wc;
94 cp = CP_GBK;
95 break;
96 case _gb2312:
97 scp = _gb2312;
98 act = _tc2sc;
99 break;
100 }
101 break;
102 case _big5:
103 switch (dcp)
104 {
105 case _unicode:
106 case _utf8:
107 case _gb2312:
108 case _gbk:
109 scp = _unicode;
110 act = _mb2wc;
111 cp = CP_BIG5;
112 break;
113 }
114 break;
115 }
116
117 switch (act)
118 {
119 case _wc2mb:
120 cch = WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, NULL, 0, NULL, NULL);
121 dest = malloc(cch * sizeof(char));
122 WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, (char *)dest, cch, NULL, NULL);
123 break;
124 case _mb2wc:
125 cch = MultiByteToWideChar(cp, 0, (char *)src, -1, NULL, 0);
126 dest = malloc(cch * sizeof(wchar_t));
127 MultiByteToWideChar(cp, 0, (char *)src, -1, (wchar_t *)dest, cch);
128 break;
129 case _sc2tc:
130 lcid = GetSystemDefaultLCID();
131 cch = LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, NULL, 0);
132 dest = malloc(cch * sizeof(char));
133 LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, (char *)dest, cch);
134 break;
135 case _tc2sc:
136 lcid = GetSystemDefaultLCID();
137 cch = LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, NULL, 0);
138 dest = malloc(cch * sizeof(char));
139 LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, (char *)dest, cch);
140 break;
141 }
142
143 if (src && (src != sstr))
144 {
145 free(src);
146 }
147 }
148
149 if (dstr)
150 {
151 *dstr = dest;
152 }
153 else
154 {
155 free(dest);
156 }
157
158 return cch;
159 }
参数说明
sstr:[in]源字符串的首地址,由于可能是char *和wchar_t *两种数据类型,所以这里我设置为了void *类型
scp:[in]源字符串的编码方式,0:UNICODE编码、1:UTF-8编码、2:GB2312编码、3:GBK编码、4:BIG5编码
dstr:[out]目标字符串地址的指针,由于可能是char **和wchar_t **两种数据类型,所以这里我设置为了void **类型
dcp:[in]目标字符串的编码方式,取值范围与scp相同
函数使用
由于编码方式比较难记忆,所以我将任意两种编码的转化进行了如下定义
/*
 * Convenience wrappers around Convert(), one per ordered pair of encodings.
 * The numeric arguments are the encoding codes Convert() expects:
 *   0 = UNICODE, 1 = UTF-8, 2 = GB2312, 3 = GBK, 4 = BIG5.
 * src is the source string; dest is the address of the pointer that
 * receives the converted string (it is cast to void ** for Convert()).
 */

/* Source: UNICODE (wide-character string) */
#define UnicodeToUtf8(src, dest) Convert((void *)(src), 0, (void **)(dest), 1)
#define UnicodeToGb2312(src, dest) Convert((void *)(src), 0, (void **)(dest), 2)
#define UnicodeToGbk(src, dest) Convert((void *)(src), 0, (void **)(dest), 3)
#define UnicodeToBig5(src, dest) Convert((void *)(src), 0, (void **)(dest), 4)
/* Source: UTF-8 */
#define Utf8ToUnicode(src, dest) Convert((void *)(src), 1, (void **)(dest), 0)
#define Utf8ToGb2312(src, dest) Convert((void *)(src), 1, (void **)(dest), 2)
#define Utf8ToGbk(src, dest) Convert((void *)(src), 1, (void **)(dest), 3)
#define Utf8ToBig5(src, dest) Convert((void *)(src), 1, (void **)(dest), 4)
/* Source: GB2312 */
#define Gb2312ToUnicode(src, dest) Convert((void *)(src), 2, (void **)(dest), 0)
#define Gb2312ToUtf8(src, dest) Convert((void *)(src), 2, (void **)(dest), 1)
#define Gb2312ToGbk(src, dest) Convert((void *)(src), 2, (void **)(dest), 3)
#define Gb2312ToBig5(src, dest) Convert((void *)(src), 2, (void **)(dest), 4)
/* Source: GBK */
#define GbkToUnicode(src, dest) Convert((void *)(src), 3, (void **)(dest), 0)
#define GbkToUtf8(src, dest) Convert((void *)(src), 3, (void **)(dest), 1)
#define GbkToGb2312(src, dest) Convert((void *)(src), 3, (void **)(dest), 2)
#define GbkToBig5(src, dest) Convert((void *)(src), 3, (void **)(dest), 4)
/* Source: BIG5 */
#define Big5ToUnicode(src, dest) Convert((void *)(src), 4, (void **)(dest), 0)
#define Big5ToUtf8(src, dest) Convert((void *)(src), 4, (void **)(dest), 1)
#define Big5ToGb2312(src, dest) Convert((void *)(src), 4, (void **)(dest), 2)
#define Big5ToGbk(src, dest) Convert((void *)(src), 4, (void **)(dest), 3)
测试代码如下:
/*
 * Demo: convert a GB2312 string to BIG5 and back, printing both results.
 *
 * NOTE(review): assumes this source file is saved in GB2312/GBK so that the
 * bytes of the string literal really are GB2312 - confirm for your build.
 *
 * Returns 0 when both conversions succeed, 1 otherwise.
 */
int main(void)
{
    char *p0 = NULL;
    char *p1 = NULL;
    int rc = 1;

    /* A usable result has a positive length (it includes the NUL) and a
     * non-NULL buffer; anything else is treated as failure. */
    if ((Gb2312ToBig5("中华人民共和国", &p0) > 0) && (p0 != NULL))
    {
        printf("%s\n", p0);
        if ((Big5ToGb2312(p0, &p1) > 0) && (p1 != NULL))
        {
            printf("%s\n", p1);
            rc = 0;
        }
    }

    free(p0);
    free(p1);
    return rc;
}