读书人

CharTokenizer的简单学习

发布时间: 2013-08-01 15:23:18 作者: rapoo

CharTokenizer的简单学习

一、关系

AttributeSource → TokenStream → Tokenizer

                       ↓

                  TokenFilter

(Tokenizer 和 TokenFilter 都继承自 TokenStream,TokenStream 继承自 AttributeSource。)

=============================================

Analyzer中的一个抽象方法是

//属性private final ReuseStrategy reuseStrategy;========================================//TokenStreamComponents//保存了tokenizer和tokeniStream//也可以设置Reader  protected abstract TokenStreamComponents createComponents(String fieldName,Reader reader);========================================//得到TokenStreampublic final TokenStream tokenStream(final String fieldName,                                       final Reader reader) throws IOException {//ReuseStrategy这个内部类是干吗的?// private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();//内部抽象类 GlobalReuseStrategy 存放:TokenStreamComponents //               PerFieldReuseStrategy存放 Map<String, TokenStreamComponents>           private final ReuseStrategy reuseStrategy;    TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);    final Reader r = initReader(fieldName, reader);    if (components == null) {      components = createComponents(fieldName, r);      reuseStrategy.setReusableComponents(fieldName, components);    } else {      components.setReader(r);    }    return components.getTokenStream();  }

分词输出例子:

?

// Tokenize "I LOVE YOU!" on whitespace and print every term wrapped in brackets.
Analyzer a = new WhitespaceAnalyzer(Version.LUCENE_43);
TokenStream tokenStream = a.tokenStream("CESHI", new StringReader("I LOVE YOU!"));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
try {
    // reset() is mandatory before the first incrementToken(); skipping it
    // ends in java.lang.ArrayIndexOutOfBoundsException.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.print("[" + termAttribute.toString() + "]");
    }
    // Finish the stream so the final offset is recorded.
    tokenStream.end();
} finally {
    // Always release the underlying reader.
    tokenStream.close();
}

?

?

?

二、TokenStream的一些方法和属性

//对于Reader的解析,Token的不断输出public abstract boolean incrementToken() throws IOException;public void reset() throws IOException {}

?

?

三、Tokenizer的属性和方法

//声明Tokenizer的时候必须传入Reader(即后面代码里用到的 input 属性)

四、CharTokenizer

?

public abstract class CharTokenizer extends Tokenizer {  //tokenizer的属性Reader  public CharTokenizer(Version matchVersion, Reader input) {    super(input);    charUtils = CharacterUtils.getInstance(matchVersion);  }    public CharTokenizer(Version matchVersion, AttributeFactory factory,      Reader input) {    super(factory, input);    charUtils = CharacterUtils.getInstance(matchVersion);  }    // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()  //用这些参数的时候必须reset()下 把bufferIndex=0  //因为第一次处理的时候       if (bufferIndex >= dataLen) 不然reader充值不进来?  private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;  private static final int MAX_WORD_LEN = 255; //允许单词的最大长度  private static final int IO_BUFFER_SIZE = 4096;//一次允许的最大的字符数    //添加一些attribute  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);    //CharacterUtils的方法  //codePointAt fill  //通过 Character得到该类提供了几种方法,以确定字符的类别(小写字母,数字,等等),并将字符从大写转换成小写,反之亦然  private final CharacterUtils charUtils;    //CharacterBuffer的属性  //char[] buffer; int offset; int length;  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);    //判断是不是token  protected abstract boolean isTokenChar(int c);  //当看到小写处理的时候是处理掉了 转换为小写了。  protected int normalize(int c) {    return c;  }    @Override  public final boolean incrementToken() throws IOException {  //这个处理是attributeSource处理的具体纳特state没看懂?    clearAttributes();        int length = 0;    int start = -1; // this variable is always initialized    int end = -1;    char[] buffer = termAtt.buffer();    //循环开始??    //offset的明白了一点,但是termAtt怎么得到字符的哪?又是怎么得到小写字符的哪?    
while (true) {    //把tokenizer的reader的值赋值到ioBuffer里      if (bufferIndex >= dataLen) {        offset += dataLen;        //tokenizer有reader参数        //实例化analyzer必须实现的方法返回TokenStreamComponents 这个类实现需要tokenizer 属性reader,TokenStream        //把输入流填充到ioBuffer中        if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils          dataLen = 0; // so next offset += dataLen won't decrement offset          if (length > 0) {            break;          } else {            finalOffset = correctOffset(offset);            return false;          }        }        //赋值成功的话datLen会得到数据长度        dataLen = ioBuffer.getLength();        bufferIndex = 0;      }     //赋值成功后判断偏移量的字符  返回给定索引上的 Unicode 代码点      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);     //确定表示指定字符(Unicode 代码点)所需的 char 值的数量 具体也不清楚      final int charCount = Character.charCount(c);      bufferIndex += charCount;      //WhitespaceTokenizer 判断是否是空格      //如果length>0也跳出了循环      if (isTokenChar(c)) {               // if it's a token char          //如果length==0 / start      if (length == 0) {                // start of token          assert start == -1;                    start = offset + bufferIndex - charCount;          end = start;        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds          buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer        }        end += charCount;        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized        if (length >= MAX_WORD_LEN) // buffer overflow! 
make sure to check for >= surrogate pair could break == test          break;      } else if (length > 0)             // at non-Letter w/ chars              break;                           // return 'em    }    termAtt.setLength(length);    assert start != -1;   offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));    return true;      }    @Override  public final void end() {    offsetAtt.setOffset(finalOffset, finalOffset);  }//重置属性  @Override  public void reset() throws IOException {    bufferIndex = 0;    offset = 0;    dataLen = 0;    finalOffset = 0;    ioBuffer.reset(); // make sure to reset the IO buffer!!  }}

?

?

?

?

?

?

?

?

读书人网 >开源软件

热点推荐