iconv clucene

2023-11-10 16:48
文章标签 iconv clucene

本文主要介绍 iconv 与 clucene 的相关代码,希望能为大家解决编程问题提供一定的参考价值,需要的开发者可以跟随小编一起学习!

/// 以下内容摘自 Linux 上 iconv 命令程序的代码,目的在于处理转码时出现无效字符的情况

/*
 * Placeholder types with no members. In this file they are used only as the
 * two trailing fields of struct conv_struct.
 * NOTE(review): presumably stand-ins for the real library definitions; their
 * true sizes are not reproduced here -- confirm nothing depends on sizeof.
 */
struct iconv_hooks {};
struct iconv_fallbacks {};
/* Wide-character value type used by the mbtowc/wctomb callbacks below. */
typedef unsigned int ucs4_t;
/* Pointer to a conversion descriptor (struct conv_struct, defined below). */
typedef struct conv_struct * conv_t;
/*
 * Pair of function pointers stored as the first member of struct conv_struct.
 * NOTE(review): the parameter lists parallel the convert and reset call forms
 * of iconv(); the exact contract is not visible in this file.
 */
struct loop_funcs {
/* Takes a descriptor, input cursor/remaining count and output cursor/remaining count. */
size_t (*loop_convert) (iconv_t icd,
const char* * inbuf, size_t *inbytesleft,
char* * outbuf, size_t *outbytesleft);
/* Takes a descriptor and only the output cursor/remaining count. */
size_t (*loop_reset) (iconv_t icd,
char* * outbuf, size_t *outbytesleft);
};
/*
 * Input-side callbacks (multibyte -> wide character); stored in
 * struct conv_struct as the `ifuncs` member.
 */
struct mbtowc_funcs {
int (*xxx_mbtowc) (conv_t conv, ucs4_t *pwc, unsigned char const *s, int n);
/*
* int xxx_mbtowc (conv_t conv, ucs4_t *pwc, unsigned char const *s, int n)
* converts the byte sequence starting at s to a wide character. Up to n bytes
* are available at s. n is >= 1.
* Result is number of bytes consumed (if a wide character was read),
* or -1 if invalid, or -2 if n too small, or -2-(number of bytes consumed)
* if only a shift sequence was read.
*/
int (*xxx_flushwc) (conv_t conv, ucs4_t *pwc);
/*
* int xxx_flushwc (conv_t conv, ucs4_t *pwc)
* returns to the initial state and stores the pending wide character, if any.
* Result is 1 (if a wide character was read) or 0 if none was pending.
*/
};
/*
 * Output-side callbacks (wide character -> multibyte); stored in
 * struct conv_struct as the `ofuncs` member.
 */
struct wctomb_funcs {
int (*xxx_wctomb) (conv_t conv, unsigned char *r, ucs4_t wc, int n);
/*
* int xxx_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
* converts the wide character wc to the character set xxx, and stores the
* result beginning at r. Up to n bytes may be written at r. n is >= 1.
* Result is number of bytes written, or -1 if invalid, or -2 if n too small.
*/
int (*xxx_reset) (conv_t conv, unsigned char *r, int n);
/*
* int xxx_reset (conv_t conv, unsigned char *r, int n)
* stores a shift sequence returning to the initial state, beginning at r.
* Up to n bytes may be written at r. n is >= 0.
* Result is number of bytes written, or -2 if n too small.
*/
};
/* Conversion state value; used for the istate/ostate members below. */
typedef unsigned int state_t;
/*
 * Conversion descriptor layout that conv_t points to.
 *
 * __charcode_convert__ in this file can cast its iconv_t handle to conv_t in
 * order to write `discard_ilseq`, so the order and types of every member up
 * to that field have to match the iconv implementation that is actually
 * linked.
 * NOTE(review): that match is assumed, not checked -- confirm against the
 * library version in use before relying on the cast.
 */
struct conv_struct {
struct loop_funcs lfuncs;
/* Input (conversion multibyte -> unicode) */
int iindex;
struct mbtowc_funcs ifuncs;
state_t istate;
/* Output (conversion unicode -> multibyte) */
int oindex;
struct wctomb_funcs ofuncs;
int oflags;
state_t ostate;
/* Operation flags */
int transliterate;
/* Can be set to 1 by __charcode_convert__ when iconv() reports EILSEQ. */
int discard_ilseq;
/* The two trailing members exist only when LIBICONV_PLUG is not defined. */
#ifndef LIBICONV_PLUG
struct iconv_fallbacks fallbacks;
struct iconv_hooks hooks;
#endif
};


/// 转载结束


/*
 * Convert `srclen` bytes at `src` from charset `from` to charset `to`,
 * writing at most `savelen` bytes into `save`.
 *
 * Parameters:
 *   from, to   charset names passed to iconv_open().
 *   save       output buffer of at least `savelen` bytes.
 *   savelen    capacity of `save` in bytes; a terminating NUL is appended
 *              only when the converted data leaves room for it.
 *   src        input bytes.
 *   srclen     number of input bytes; must be > 0.
 *   ignore_invalid_sequence
 *              when true, "//IGNORE" is appended to the target charset name.
 *
 * Independently of that flag, the first EILSEQ reported by iconv() switches
 * the descriptor to "discard invalid sequences" and retries once; a second
 * consecutive EILSEQ is a failure. An incomplete multibyte sequence at the
 * end of the input (EINVAL) ends the conversion and keeps what was produced.
 *
 * Returns:
 *    0  at least one output byte was produced
 *   -1  invalid arguments, iconv_open() failure, or nothing was produced
 *   -3  invalid input could not be skipped
 *   -5  output buffer too small (E2BIG)
 *   -6  any other iconv() error
 */
int __charcode_convert__(LPCSTR from, LPCSTR to, LPSTR save, int savelen, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    iconv_t cd;
    const char *inptr = src;
    char *outptr = save;
    size_t insize;
    size_t outsize;
    size_t produced;
    int status = 0;
    int retried_ilseq = 0;

    /* Reject NULLs and negative lengths before they turn into huge size_t values. */
    if (from == NULL || to == NULL || save == NULL || src == NULL
        || savelen < 0 || srclen <= 0)
    {
        return -1;
    }

    insize = (size_t)srclen;
    outsize = (size_t)savelen;

    if (!ignore_invalid_sequence)
    {
        cd = iconv_open(to, from);
    }
    else
    {
        char tocharset[64] = {0};
        int n = snprintf(tocharset, sizeof tocharset, "%s//IGNORE", to);

        /* A name that does not fit is treated like an unknown charset. */
        if (n < 0 || (size_t)n >= sizeof tocharset)
            cd = (iconv_t)(-1);
        else
            cd = iconv_open(tocharset, from);
    }

    if (cd == (iconv_t)(-1))
    {
        /* Nothing was opened, so there is nothing to close. */
        printf("iconv_open oper error!\n");
        return -1;
    }

    /* Put the descriptor into its initial shift state. */
    iconv(cd, NULL, NULL, NULL, NULL);

    while (insize > 0)
    {
        size_t res = iconv(cd, (char**)&inptr, &insize, &outptr, &outsize);

        if (res == (size_t)(-1))
        {
            int err = errno;

            if (err == EILSEQ)
            {
                if (retried_ilseq)
                {
                    status = -3;
                    goto done;
                }
#ifdef ICONV_SET_DISCARD_ILSEQ
                {
                    /* Supported libiconv request for the same setting. */
                    int discard = 1;

                    if (iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &discard) != 0)
                    {
                        status = -3;
                        goto done;
                    }
                }
#else
                /*
                 * NOTE(review): writes through the opaque handle; only valid
                 * when the linked iconv really uses the conv_struct layout
                 * declared in this file -- confirm for the target platform.
                 */
                ((conv_t)cd)->discard_ilseq = 1;
#endif
                retried_ilseq = 1;
                continue;
            }

            if (err == EINVAL)
            {
                /* Truncated trailing sequence: keep what has been converted. */
                break;
            }

            status = (err == E2BIG) ? -5 : -6;
            goto done;
        }

        retried_ilseq = 0;

        /* NOTE(review): project helper, kept from the original loop; purpose not visible here. */
        lj_sleep(0, 1);
    }

    /* Emit any pending shift sequence so stateful encodings end in the initial state. */
    if (iconv(cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
    {
        status = (errno == E2BIG) ? -5 : -6;
        goto done;
    }

    /* Success is judged by bytes actually written, not by scanning the buffer. */
    status = (outptr != save) ? 0 : -1;

done:
    produced = (size_t)(outptr - save);
    if (produced < (size_t)savelen)
        save[produced] = '\0';
    iconv_close(cd);
    return status;
}

/*
 * Convert `srclen` bytes at `src` from charset `from` to charset `to` and
 * return the result in a newly allocated, NUL-terminated buffer.
 *
 * The output buffer is sized at 4 * srclen bytes plus a terminator. When the
 * conversion fails, `srclen` is not positive, or `srclen` is too large for
 * 4 * srclen to fit in an int, an allocated empty string is returned instead.
 *
 * Returns NULL only when even the one-byte empty string cannot be allocated.
 * The caller owns the returned buffer and must release it.
 */
char *charcode_convert(LPCSTR from, LPCSTR to, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    /* Largest srclen for which 4 * srclen still fits in an int (INT_MAX / 4). */
    const int max_srclen = (int)(((unsigned int)-1) >> 3);
    char *outbuf;

    if (srclen > 0 && srclen <= max_srclen)
    {
        int outcap = 4 * srclen;
        size_t alloc_size = (size_t)outcap + sizeof(char);

        outbuf = (char*)malloc(alloc_size);
        if (outbuf != NULL)
        {
            memset(outbuf, 0, alloc_size);
            if (__charcode_convert__(from, to, outbuf, outcap, src, srclen, ignore_invalid_sequence) == 0)
                return outbuf;
            LJFREE(outbuf);
        }
    }

    /* Failure path: hand back an empty string so callers can still read it. */
    outbuf = (char*)malloc(sizeof(char));
    if (outbuf != NULL)
        memset(outbuf, 0, sizeof(char));
    return outbuf;
}

/*
 * Convert the NUL-terminated UTF-8 string `utf8buf` to the charset named by
 * `to_chna_charset` via charcode_convert() and return its result.
 * A NULL `utf8buf` is passed on as zero-length input instead of being handed
 * to strlen().
 */
char* utf8_to_chna(char *utf8buf, bool ignore_invalid_sequence, LPCSTR to_chna_charset)
{
    int srclen = (utf8buf != NULL) ? (int)strlen(utf8buf) : 0;

    return charcode_convert("UTF-8", to_chna_charset, utf8buf, srclen, ignore_invalid_sequence);
}

/*
 * Convert the NUL-terminated string `chnabuf`, encoded in the charset named
 * by `frm_chna_charset`, to UTF-8 via charcode_convert() and return its
 * result. A NULL `chnabuf` is passed on as zero-length input instead of being
 * handed to strlen().
 */
char* chna_to_utf8(char *chnabuf, bool ignore_invalid_sequence, LPCSTR frm_chna_charset)
{
    int srclen = (chnabuf != NULL) ? (int)strlen(chnabuf) : 0;

    return charcode_convert(frm_chna_charset, "UTF-8", chnabuf, srclen, ignore_invalid_sequence);
}

这篇关于 iconv clucene 的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!



http://www.chinasem.cn/article/383983

相关文章

cocos2d_x使用iconv库转码解决中文显示问题

关于cocos2dx中文显示,流行有三种解决方案。      1、客户端全部使用UTF-8文件。       这种办法对于写在客户端上的文字还是挺有用的,但对于服务端发送过来的中文就显示乱码了,但如果是单机还是挺推荐的。可以用宏定义的方法将固定的内容定义在某个文件,如果想发展国外市场版本,可以直接拿这个文件给翻译。      2、使用iconv库。      3、使用xml文件读取。      这

clucene demo编译

之前提到的是doris的third-party编译,现在需要编译clucene的demo。首先,在build文件夹里执行 cmake .. ,这一步通过cmake生成了make的原材料。然后,在make前,export CC / CXX指定编译器。(cmakecache)在这一步形成。 (ps 这里一定要指定好编译器版本,不然看似编译起来了,可能不能链接。) (注意 有的编译器版本没法链接动态库)

使用node中的iconv-lite实现对“gbk”格式的转码

在 Windows 中,gbk和utf-8是最常见的两种格式,但是我们在显示的时候往往需要将GBK转换为UTF-8,我现在有一个同步读取文件的操作: const fs = require('fs');const path = require('path');const buffer = fs.readFileSync(path.join(__dirname, '../lyrics/友谊之光.lrc'

Linux 文件管理命令 tr col colrm fold iconv

文章目录 2.Linux 文件管理命令2.49 tr:转换字符案例练习 2.50 col:过滤控制字符案例练习 2.51 colrm:删除指定的行案例练习 2.52 fold:限制文件列宽案例练习 2.53 iconv:转换给定文件的编码案例练习 2.Linux 文件管理命令 2.49 tr:转换字符 作用:从标准输入设备读取数据,经过字符串转义后,输出到标准输出设备。

android 添加 iconv 支持

NDK自带的iconv的 查看 android-ndk-r9d/sources/android/support/include/iconv.h 在Android.mk中加入 LOCAL_WHOLE_STATIC_LIBRARIES += android_support $(call import-module,android/support) 如: LOCAL_PATH

iconv 使用

iconv -f ASCII -t Utf-8 ddd.txt > mm.txt 把编码ASCII格式的ddd.txt文件转码为utf-8,并且存储为mm.txt文件。

DOCKER 阿里云 OSS iconv()

环境介绍: docker镜像、alpine系统、hyperf框架、使用阿里云Oss的sdk上传文件 报错记录: PHP Notice: iconv(): Wrong charset, conversion from `GBK' to `UTF-8//IGNORE' is not allowed in /opt/www/vendor/aliyuncs/oss-sdk-php/src/OSS/

PHP iconv()字符编码转换的问题

在php函数库有一个函数:iconv(),iconv函数库能够完成各种字符集间的转换,是php编程中不可缺少的基础函数库。 最近在做一个小偷程序,需要用到iconv函数把抓取过来的utf-8编码的页面转成gb2312, 发现只要用iconv函数把抓取过来的数据一转码,数据就会无缘无故地少一些。 让我郁闷了好一会儿,去网上一查资料才知道这是iconv函数的一个bug。iconv在转换字符”—”到gb

iconv函数提示错误,解决!

在网上看到json的文章,打算测试一下,结果,出现错误,先附上代码:   [php]  view plain copy <?<a href="http://lib.csdn.net/base/php" class='replace_word' title="PHP知识库" target='_blank' style='color:#df3434; font-weight:b

PHP 解决采集乱码问题mb_convert_encoding和iconv使用比较 by cubeking

mb_convert_encoding的用法见官方: http://cn.php.net/manual/zh/function.mb-convert-encoding.php 做一个GBK To UTF-8 < ?php header("content-Type: text/html; charset=Utf-8"); echo mb_convert_encoding("妳係我的友仔",