UTF-8/UNICODE/简体中文/繁体中文之间的转换

UTF-8/UNICODE/简体中文/繁体中文之间的转换

简介

这几天一直在研究中文的简体和繁体之间的转换问题,网上查了一下资料,在此进行整理和备份。

繁体中文有GBK码和BIG5码两种编码,简体中文一般使用的是GB2312编码。

这些编码之间的转换基本都是使用下列3个函数:LCMapString、WideCharToMultiByte和MultiByteToWideChar,其中还会牵涉到UNICODE码和UTF-8码这两种编码。

GB2312编码与GBK编码可以直接使用LCMapString转换,GB2312编码/GBK编码与BIG5编码则无法直接转换,必须使用UNICODE作为中间编码进行中转。

另外UTF-8编码是网络常用编码,如XML文件和网页基本都是使用这种编码,所以在此也一并研究了一下。

下面是我将GB2312/GBK/BIG5/UNICODE/UTF-8这5种编码之间的转换编写到一个函数的代码。

代码

/* Windows code page identifiers; guarded because the SDK may already define them. */
#ifndef CP_GBK
#define CP_GBK 936
#endif
#ifndef CP_BIG5
#define CP_BIG5 950
#endif
#ifndef CP_UTF8
#define CP_UTF8 65001
#endif

/**
 * Convert a NUL-terminated string between UTF-16 ("UNICODE"), UTF-8, GB2312,
 * GBK and BIG5.
 *
 * The conversion is performed as a chain of single steps (at most three):
 *   - UTF-16 <-> UTF-8 / code page 936 / code page 950 through
 *     WideCharToMultiByte / MultiByteToWideChar;
 *   - GB2312 <-> GBK (simplified <-> traditional glyphs inside code page 936)
 *     through LCMapStringA;
 *   - anything involving BIG5 or UTF-8 is routed through UTF-16.
 *
 * @param sstr  Source string: wchar_t * when scp is 0, char * otherwise.
 * @param scp   Source encoding: 0 UTF-16, 1 UTF-8, 2 GB2312, 3 GBK, 4 BIG5.
 * @param dstr  Receives the result pointer (wchar_t * when dcp is 0, char *
 *              otherwise). May be NULL, in which case the result is discarded
 *              and only the length is reported.
 * @param dcp   Destination encoding, same values as scp.
 *
 * @return The element count of the last conversion step, terminator included
 *         (wchar_t units when the last step produced UTF-16, bytes otherwise);
 *         0 when scp == dcp (nothing to do); -1 on an invalid encoding id,
 *         a NULL source that needs converting, an allocation failure or a
 *         failed Win32 conversion call. On -1, *dstr is left untouched.
 *
 * Ownership: on success *dstr is a malloc'ed buffer that the caller must
 * free(). The one exception is scp == dcp: no copy is made and *dstr is set to
 * sstr itself, so it must not be freed independently of sstr.
 */
int Convert(void *sstr, int scp, void **dstr, int dcp)
{
    enum { _unicode, _utf8, _gb2312, _gbk, _big5 };
    enum { _none, _wc2mb, _mb2wc, _sc2tc, _tc2sc };

    /*
     * Use a fixed Simplified Chinese locale for LCMapStringA rather than the
     * system default: the ANSI variant interprets the bytes with the locale's
     * code page, which must be 936 here regardless of the machine's settings.
     */
    const LCID zh_cn = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED),
                                SORT_DEFAULT);
    const int scp0 = scp;   /* original source encoding, needed for routing */
    void *dest = sstr;
    int cch = 0;

    if (scp < _unicode || scp > _big5 || dcp < _unicode || dcp > _big5)
        return -1;
    if (scp != dcp && sstr == NULL)
        return -1;

    while (scp != dcp) {
        void *src = dest;
        int act = _none;
        UINT cp = 0;
        DWORD map = 0;
        int ok = 0;

        /* Plan the next step: pick the action and the encoding it yields. */
        switch (scp) {
        case _unicode:
            act = _wc2mb;
            switch (dcp) {
            case _utf8:
                scp = _utf8;
                cp = CP_UTF8;
                break;
            case _gb2312:
                /*
                 * Text that started as BIG5 is traditional: land on GBK first
                 * so that the next iteration maps it to simplified glyphs.
                 */
                scp = (scp0 == _big5) ? _gbk : _gb2312;
                cp = CP_GBK;
                break;
            case _gbk:
                scp = _gbk;
                cp = CP_GBK;
                break;
            case _big5:
                scp = _big5;
                cp = CP_BIG5;
                break;
            default:
                act = _none;
                break;
            }
            break;
        case _utf8:
            scp = _unicode;
            act = _mb2wc;
            cp = CP_UTF8;
            break;
        case _gb2312:
            if (dcp == _gbk || dcp == _big5) {
                scp = _gbk;
                act = _sc2tc;
            } else {
                scp = _unicode;
                act = _mb2wc;
                cp = CP_GBK;
            }
            break;
        case _gbk:
            if (dcp == _gb2312) {
                scp = _gb2312;
                act = _tc2sc;
            } else {
                scp = _unicode;
                act = _mb2wc;
                cp = CP_GBK;
            }
            break;
        case _big5:
            scp = _unicode;
            act = _mb2wc;
            cp = CP_BIG5;
            break;
        default:
            break;
        }

        /* Execute the step: size query, allocation, then the real conversion. */
        dest = NULL;
        cch = 0;
        switch (act) {
        case _wc2mb:
            cch = WideCharToMultiByte(cp, 0, (LPCWSTR)src, -1, NULL, 0, NULL, NULL);
            if (cch > 0) {
                dest = malloc((size_t)cch * sizeof(char));
                if (dest != NULL)
                    ok = WideCharToMultiByte(cp, 0, (LPCWSTR)src, -1,
                                             (LPSTR)dest, cch, NULL, NULL) > 0;
            }
            break;
        case _mb2wc:
            cch = MultiByteToWideChar(cp, 0, (LPCSTR)src, -1, NULL, 0);
            if (cch > 0) {
                dest = malloc((size_t)cch * sizeof(wchar_t));
                if (dest != NULL)
                    ok = MultiByteToWideChar(cp, 0, (LPCSTR)src, -1,
                                             (LPWSTR)dest, cch) > 0;
            }
            break;
        case _sc2tc:
        case _tc2sc:
            map = (act == _sc2tc) ? LCMAP_TRADITIONAL_CHINESE
                                  : LCMAP_SIMPLIFIED_CHINESE;
            /* Explicit A variant: the buffers are char even in UNICODE builds. */
            cch = LCMapStringA(zh_cn, map, (LPCSTR)src, -1, NULL, 0);
            if (cch > 0) {
                dest = malloc((size_t)cch * sizeof(char));
                if (dest != NULL)
                    ok = LCMapStringA(zh_cn, map, (LPCSTR)src, -1,
                                      (LPSTR)dest, cch) > 0;
            }
            break;
        default:
            break;
        }

        /* Intermediate buffers are ours; the caller's input never is. */
        if (src != sstr)
            free(src);

        if (!ok) {
            free(dest);
            return -1;
        }
    }

    if (dstr) {
        *dstr = dest;
    } else if (dest != sstr) {
        /* Result not wanted; never free the caller's own buffer. */
        free(dest);
    }

    return cch;
}

参数说明

sstr:[in] 源字符串的首地址。由于可能是 char * 和 wchar_t * 两种数据类型,所以这里设置为 void * 类型。

scp:[in] 源字符串的编码方式。0:UNICODE编码、1:UTF-8编码、2:GB2312编码、3:GBK编码、4:BIG5编码。

dstr:[out] 目标字符串地址的指针。由于可能是 char ** 和 wchar_t ** 两种数据类型,所以这里设置为 void ** 类型。

dcp:[in] 目标字符串的编码方式,取值范围与 scp 相同。

函数使用

由于编码方式比较难记忆,所以我将任意两种编码的转化进行了如下定义

/*
 * Encoding identifiers accepted by Convert() for its scp/dcp arguments.
 * Plain #defines are used (rather than an enum) so that this block can be
 * included more than once without a redefinition error.
 */
#define CONVERT_ENC_UNICODE 0
#define CONVERT_ENC_UTF8    1
#define CONVERT_ENC_GB2312  2
#define CONVERT_ENC_GBK     3
#define CONVERT_ENC_BIG5    4

/*
 * Shorthand wrappers around Convert(), one per (source, destination) pair.
 * src is the input string; dest is the address of the pointer that receives
 * the result (or NULL). Each macro evaluates to Convert()'s return value.
 */

/* From UTF-16 ("UNICODE"); src is a wchar_t *. */
#define UnicodeToUtf8(src, dest) Convert((void *)(src), CONVERT_ENC_UNICODE, (void **)(dest), CONVERT_ENC_UTF8)
#define UnicodeToGb2312(src, dest) Convert((void *)(src), CONVERT_ENC_UNICODE, (void **)(dest), CONVERT_ENC_GB2312)
#define UnicodeToGbk(src, dest) Convert((void *)(src), CONVERT_ENC_UNICODE, (void **)(dest), CONVERT_ENC_GBK)
#define UnicodeToBig5(src, dest) Convert((void *)(src), CONVERT_ENC_UNICODE, (void **)(dest), CONVERT_ENC_BIG5)

/* From UTF-8. */
#define Utf8ToUnicode(src, dest) Convert((void *)(src), CONVERT_ENC_UTF8, (void **)(dest), CONVERT_ENC_UNICODE)
#define Utf8ToGb2312(src, dest) Convert((void *)(src), CONVERT_ENC_UTF8, (void **)(dest), CONVERT_ENC_GB2312)
#define Utf8ToGbk(src, dest) Convert((void *)(src), CONVERT_ENC_UTF8, (void **)(dest), CONVERT_ENC_GBK)
#define Utf8ToBig5(src, dest) Convert((void *)(src), CONVERT_ENC_UTF8, (void **)(dest), CONVERT_ENC_BIG5)

/* From GB2312. */
#define Gb2312ToUnicode(src, dest) Convert((void *)(src), CONVERT_ENC_GB2312, (void **)(dest), CONVERT_ENC_UNICODE)
#define Gb2312ToUtf8(src, dest) Convert((void *)(src), CONVERT_ENC_GB2312, (void **)(dest), CONVERT_ENC_UTF8)
#define Gb2312ToGbk(src, dest) Convert((void *)(src), CONVERT_ENC_GB2312, (void **)(dest), CONVERT_ENC_GBK)
#define Gb2312ToBig5(src, dest) Convert((void *)(src), CONVERT_ENC_GB2312, (void **)(dest), CONVERT_ENC_BIG5)

/* From GBK. */
#define GbkToUnicode(src, dest) Convert((void *)(src), CONVERT_ENC_GBK, (void **)(dest), CONVERT_ENC_UNICODE)
#define GbkToUtf8(src, dest) Convert((void *)(src), CONVERT_ENC_GBK, (void **)(dest), CONVERT_ENC_UTF8)
#define GbkToGb2312(src, dest) Convert((void *)(src), CONVERT_ENC_GBK, (void **)(dest), CONVERT_ENC_GB2312)
#define GbkToBig5(src, dest) Convert((void *)(src), CONVERT_ENC_GBK, (void **)(dest), CONVERT_ENC_BIG5)

/* From BIG5. */
#define Big5ToUnicode(src, dest) Convert((void *)(src), CONVERT_ENC_BIG5, (void **)(dest), CONVERT_ENC_UNICODE)
#define Big5ToUtf8(src, dest) Convert((void *)(src), CONVERT_ENC_BIG5, (void **)(dest), CONVERT_ENC_UTF8)
#define Big5ToGb2312(src, dest) Convert((void *)(src), CONVERT_ENC_BIG5, (void **)(dest), CONVERT_ENC_GB2312)
#define Big5ToGbk(src, dest) Convert((void *)(src), CONVERT_ENC_BIG5, (void **)(dest), CONVERT_ENC_GBK)

测试代码如下:

/*
 * Demo: converts a Simplified Chinese string GB2312 -> BIG5 and back again,
 * printing each result.
 * NOTE(review): the literal is only GB2312 at run time if this source file is
 * compiled with a GB2312/GBK execution character set - confirm the build setup.
 */
int main(void)
{
    /* Start as NULL so the unconditional free() calls below are always safe. */
    char *p0 = NULL;
    char *p1 = NULL;
    int rc = 1;

    /* A successful conversion reports a positive length (terminator included). */
    if (Gb2312ToBig5("中华人民共和国", &p0) > 0)
    {
        printf("%s\n", p0);
        if (Big5ToGb2312(p0, &p1) > 0)
        {
            printf("%s\n", p1);
            rc = 0;
        }
    }

    free(p0);
    free(p1);
    return rc;
}

相关推荐

魅族手机如何强制关机
365下载bet

魅族手机如何强制关机

🕒 09-27 👁️ 4756
2015年世界杯男子乒乓球赛在瑞典哈尔姆斯塔德举行
正规365没有黑钱

2015年世界杯男子乒乓球赛在瑞典哈尔姆斯塔德举行

🕒 01-12 👁️ 8364