乡下人产国偷v产偷v自拍,国产午夜片在线观看,婷婷成人亚洲综合国产麻豆,久久综合给合久久狠狠狠9

  • <output id="e9wm2"></output>
    <s id="e9wm2"><nobr id="e9wm2"><ins id="e9wm2"></ins></nobr></s>

    • 分享

      基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞

       richsky 2012-04-20
      基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

      第一次寫中文分詞程序,歡迎拍磚。

      public class MM2 
      {
          
      private static final Log log = LogFactory.getLog(MM2.class);
          
          
      private static HashMap<String, Integer> dictionary = null
          
      private static final int WORD_MAX_LENGTH = 9;
          
      private Reader reader;
          
          
      static
          
      {
              loadDictionary();
          }

          
          
      public MM2(Reader reader) 
          

              
      this.reader = reader; 
          }
       
          
          
      //切分出由中文、字母、數(shù)字組成的句子
          public ArrayList<Sentence> getSentence() throws IOException
          
      {   
              ArrayList
      <Sentence> list=new ArrayList<Sentence>();
              StringBuffer cb
      =new StringBuffer();
              
      int d=reader.read();
              
      int offset=0;
              
      boolean b=false;
              
      while(d>-1)
              
      {
                  
      int type=Character.getType(d);
                  
      if(type==2 || type==9 || type==5)
                  
      {
                      d
      =toAscii(d);
                      cb.append((
      char)d);
                  }

                  
      else
                  
      {
                      b
      =true;
                  }

                  d
      =reader.read();
                  
      if(d==-1 || b)
                  
      {
                      
      if(d==-1) offset++;
                      b
      =false;
                      
      char[] ioBuffer = new char[cb.length()];
                      cb.getChars(
      0, cb.length(), ioBuffer, 0);
                      Sentence sen
      =new Sentence(ioBuffer,offset-cb.length());
                      list.add(sen);
                      cb.setLength(
      0);
                  }

                  offset
      ++;
              }

              
      return list;
          }

          
          
      //將句子切分出詞
          public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
          
      {
              ArrayList
      <Token> tokenlist=new ArrayList<Token>();
              
      for(Sentence sen:list)
              
      {
                  StringBuffer word 
      = new StringBuffer();
                  
      int offset=sen.getStartOffset();
                  
      int bufferIndex = 0;
                  
      char c;
                  
      boolean b=false;
                  
      while(bufferIndex<sen.getText().length)
                  
      {
                      offset
      ++;
                      c
      =sen.getText()[bufferIndex++];
                      
      if(word.length()==0)
                          word.append(c);
                      
      else
                      
      {
                          String temp 
      = (word.toString() + c).intern();
                          
      if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                              word.append(c);
                          
      else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                              word.append(c);
                          
      else
                          
      {
                              bufferIndex
      --;
                              offset
      --;
                              
      while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                              
      {
                                  word.deleteCharAt(word.length()
      -1);
                                  bufferIndex
      --;
                                  offset
      --;
                              }

                              b
      =true;
                          }

                      }

                      
      if(b || bufferIndex==sen.getText().length)
                      
      {
                          Token token 
      = new Token(word.toString(),offset-word.length(),offset,"word");
                          word.setLength(
      0);
                          tokenlist.add(token);
                          b
      =false;
                      }

                  }

              }

              
      return tokenlist;
          }

          
          
      //將相連的單個英文或數(shù)字組合成詞
          public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
          
      {
              ArrayList
      <Token> tokenlist=new ArrayList<Token>();
              Token word
      =null;
              
      for(int i=0;i<list.size();i++)
              
      {
                  Token t
      =list.get(i);
                  
      if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
                  
      {
                      
      if(word==null)
                          word
      =t;
                      
      else if(word.getEnd()==t.getStart())
                      
      {
                          word.setEnd(t.getEnd());
                          word.setWord(word.getWord()
      +t.getWord());
                      }

                      
      else
                      
      {
                          tokenlist.add(word);
                          word
      =t;
                      }

                  }

                  
      else if(word!=null)
                  
      {
                      tokenlist.add(word);
                      word
      =null;
                      tokenlist.add(t);
                  }

                  
      else
                      tokenlist.add(t);
              }

              
      if(word!=null)
                  tokenlist.add(word);
              
      return tokenlist;
          }

          
          
      //雙角轉(zhuǎn)單角
          public static int toAscii(int codePoint) 
          
      {
              
      if((codePoint>=65296 && codePoint<=65305)    //0-9
                      || (codePoint>=65313 && codePoint<=65338)    //A-Z
                      || (codePoint>=65345 && codePoint<=65370)    //a-z
                      )
              
      {    
                  codePoint 
      -= 65248;
              }

              
      return codePoint;
          }

          
          
      //加載詞典
          public static void loadDictionary() 
          
      {  
              
      if (dictionary == null
              
      {    
                  dictionary 
      = new HashMap<String, Integer>();    
                  InputStream is 
      = null;    
                  BufferedReader br 
      = null;            
                  
      try
                  
      {
                      is 
      = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                      br 
      = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                      String word 
      = null;
                      
      while ((word = br.readLine()) != null
                      
      {
                          word
      =word.toLowerCase();
                          
      if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                          
      {
                              dictionary.put(word.intern(), 
      1);    
                              
      int i = word.length()-1
                              
      while(i >= 2)
                              
      {
                                  String temp 
      = word.substring(0, i).intern(); 
                                  
      if (!dictionary.containsKey(temp))
                                      dictionary.put(temp,
      2); 
                                  i
      --;
                              }

                          }

                      }

                  }

                  
      catch (Exception e) 
                  
      {      
                      log.info(e);
                  }

                  
      finally
                  
      {
                      
      try 
                      
      {      
                          
      if(br!=null)
                              br.close();   
                          
      if(is!=null)
                              is.close();  
                      }

                      
      catch (IOException e)
                      
      {     
                          log.info(e);
                      }
                  
                  }
       
              }
       
          }

          
          
      public static String[] segWords(Reader input)
          
      {
              ArrayList
      <String> list=new ArrayList<String>();
              
      try
              
      {
                  MM2 f
      =new MM2(input);
                  ArrayList
      <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
                  
      for(Token t:tlist)
                  
      {
                      list.add(t.getWord());
                  }

              }

              
      catch(IOException e)
              
      {
                  log.info(e);
              }

              
      return (String[])list.toArray(new String[0]);
          }

          
          
      public static void main(String[] args) 
          
      {
              String[] cc
      =MM2.segWords(new StringReader("ibm商務機t60p".toLowerCase()));
              
      for(String c:cc)
              
      {
                  System.out.println(c);
              }

          }

      }
      posted on 2011-08-04 15:31 nianzai 閱讀(1713) 評論(0)  編輯  收藏 所屬分類: 中文分詞

        本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
        轉藏 分享 獻花(0)

        0條評論

        發表

        請遵守用戶 評論公約

        類似文章 更多