clucene 读源码记录
0\ iconv 的使用,参考:http://www.gnu.org/software/libc/manual/html_node/iconv-Examples.html#iconv-Examples
1\ TestUtf8 中的测试:把 GBK 编码的数据转换为 Unicode,然后建立索引
/**
 * Indexes one GBK-encoded language sample file as a single document.
 *
 * Builds the path <clucene_data_location>/utf8text/<file>_gbk.txt, asserts
 * through CuTest that the directory and the file exist, then adds a document
 * with a "language" keyword field (the value of `file`) and a "contents"
 * text field whose reader decodes the file as GBK.
 *
 * @param tc   CuTest case used for the existence assertions.
 * @param ndx  index writer that receives the document.
 * @param file language name; also the base name of the sample file.
 */
void _Index(CuTest *tc, IndexWriter* ndx,char* file){
    char path[CL_MAX_PATH];
    TCHAR tlang[20];

    strcpy(path, clucene_data_location);
    strcat(path, "/utf8text");
    CuAssert(tc, _T("Utf8 directory does not exist"), Misc::dir_Exists(path));

    strcat(path, "/");
    strcat(path, file);
    /*strcat(path,"_utf8.txt");*/
    strcat(path, "_gbk.txt");
    CuAssert(tc, _T("Language file does not exist"), Misc::dir_Exists(path));

    // Bound the copy by the real capacity of tlang (the previous bound was
    // CL_MAX_PATH, which is unrelated to this 20-element buffer) and
    // terminate explicitly so the result is a valid string either way.
    const size_t tlangCapacity = sizeof(tlang) / sizeof(tlang[0]);
    size_t langLen = strlen(file);
    if (langLen >= tlangCapacity) {
        langLen = tlangCapacity - 1;
    }
    STRCPY_AtoT(tlang, file, langLen);
    tlang[langLen] = 0;

    Document doc;
    doc.add(*Field::Keyword(_T("language"), tlang));

    // The file reader performs the GBK decoding.
    jstreams::FileReader* fr = new jstreams::FileReader(path, "GBK");
    // StandardAnalyzer analyzer;
    // TokenStream* stream = analyzer.tokenStream(NULL, _CLNEW CL_NS(util)::Reader(fr,true));

    // NOTE(review): the `true` argument presumably hands ownership of `fr`
    // to the Reader - confirm against the Reader constructor.
    doc.add(*Field::Text(_T("contents"), _CLNEW CL_NS(util)::Reader(fr, true)));
    ndx->addDocument(&doc);
}
2\ CLucene 的 util 库代码中的 FileInputStream:一个从文件中读出数据并填充到缓冲区的方法
int32_t FileInputStream::fillBuffer(char* start, int32_t space) { if (file == 0) return -1; // read into the buffer int32_t nwritten = fread(start, 1, space, file); // check the file stream status if (ferror(file)) { error = "Could not read from file '" + filepath + "'."; fclose(file); file = 0; status = Error; return -1; } if (feof(file)) { fclose(file); file = 0; } return nwritten;}
3\ InputStreamBuffer<T>::setSize
与上面填充缓冲区有关的代码:调整缓冲区的大小
template <class T> template <class char>void InputStreamBuffer<T>::setSize(int32_t size) { // store pointer information int32_t offset = (int32_t)(readPos - start);// allocate memory in the buffer if ( start == 0 )start = (T*)malloc(size*sizeof(T));elsestart = (T*)realloc(start, size*sizeof(T)); this->size = size; // restore pointer information readPos = start + offset;}
4\GBK转UCS-2码,然后索引
iconv_t converter = iconv_open("UCS-2-INTERNAL", "GBK");//iconv_t converter = iconv_open("UCS-2-INTERNAL", "UTF-8");// iconv_t converter = iconv_open("UCS-2-INTERNAL", "ASCII"); const char *inbuf ="我喜欢你欧阳"; size_t inbytesleft = strlen(inbuf); wchar_t* start=(wchar_t*)malloc(inbytesleft*sizeof(wchar_t)); memset(start,0,inbytesleft*sizeof(wchar_t)); size_t outbytesleft = sizeof(wchar_t)*inbytesleft; char *outbuf = (char*)start; size_t r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft); iconv_close(converter); doc.add(*Field::Text(_T("contents"),start));
5\ 从 GB2312 转 UTF-8,再从 UTF-8 转 UCS-2
//代码转换:从一种编码转为另一种编码int code_convert(char *from_charset,char *to_charset,const char *inbuf,size_t inlen,char *outbuf,size_t outlen){iconv_t cd;int rc;const char **pin = &inbuf;char **pout = &outbuf;cd = iconv_open(to_charset,from_charset);if (cd==0) return -1;memset(outbuf,0,outlen);if (iconv(cd,pin,&inlen,pout,&outlen)==-1) return -1;iconv_close(cd);return 0;}//UNICODE码转为GB2312码int u2g(char *inbuf,size_t inlen,char *outbuf,size_t outlen){return code_convert("utf-8","gb2312",inbuf,inlen,outbuf,outlen);}//GB2312码转为UNICODE码int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen){return code_convert("gb2312","utf-8",inbuf,inlen,outbuf,outlen);} iconv_t converter = iconv_open("UCS-2-INTERNAL", "UTF-8"); //gb2312码转为unicode码 char *in_gb2312="我喜欢你欧阳"; size_t tmpleng = strlen(in_gb2312);// char out[500]; char* out=(char*)malloc(3*tmpleng*sizeof(char)); int rec = g2u(in_gb2312,strlen(in_gb2312),out,3*tmpleng); const char * inbuf=out; size_t inbytesleft = strlen(inbuf); wchar_t* start=(wchar_t*)malloc(inbytesleft*sizeof(wchar_t)); memset(start,0,inbytesleft*sizeof(wchar_t)); size_t outbytesleft = sizeof(wchar_t)*inbytesleft; char *outbuf = (char*)start; size_t r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft); iconv_close(converter); doc.add(*Field::Text(_T("contents"),start));