感谢thinkphp中英分词算法的作者!
最近项目需要做全文检索方面的功能,所以找了thinkphp的中英分词算法进行拓展:首先把sqlite形式的词库转成了mysql,
其次把分词后的中文转变成区位码形式,从而支持mysql的全文检索!
演示:http://www.ye55.cn/fc/fc.php
下载:http://www.ye55.cn/fc/group.zip (内含词库sql文件,包含近30W词的词库,再次感谢yhustc)
主要的文件:WordSegment.class.php
PHP代码
<?php
/**
 * Chinese/English word segmentation library.
 *
 * Sentences are first cut into short phrases at punctuation and at
 * Chinese/Latin boundaries. Chinese phrases are then segmented with a
 * forward-scanning, maximum-length dictionary match; runs of consecutive
 * characters that match nothing are queued and emitted as one unknown word.
 *
 * The dictionary is a `dict` table with a `word` column reached through PDO.
 * Subclasses may override openDict(), closeDict() and findinDict() to plug in
 * a different dictionary backend.
 */
class WordSegment
{
    /** @var array Tokens produced by the most recent segment() call. */
    public $result = array();

    /** @var PDO|null Dictionary connection; null while no dictionary is mounted. */
    protected $db;

    /** @var PDOStatement|null Prepared lookup statement, created lazily by findinDict(). */
    protected $lookup;

    /** @var int Number of dictionary lookups issued by the most recent segment() call. */
    public $querytimes = 0;

    /**
     * Characters treated as "English": half-width and full-width Latin
     * letters and digits. A run of these is emitted as one token.
     *
     * @var array
     */
    protected $enChar = array(
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    );

    /**
     * High-frequency single characters. They rarely form a word on their own
     * and would otherwise pollute the unknown-word queue, so an unmatched
     * occurrence is used as a break point instead.
     *
     * @var array
     */
    protected $highfreq = array(
        '我', '是', '为', '了', '的', '你', '他', '她', '它', '们', '这', '那', '在',
        '和', '一', '不', '有', '对', '中', '要', '上', '也', '人', '等', '说',
    );

    /**
     * Punctuation and whitespace that split a sentence into phrases.
     *
     * Every entry must be exactly ONE character, because cnSplit() compares
     * the list against single characters. The control characters are written
     * with double quotes so that they are real CR/LF/TAB bytes.
     *
     * @var array
     */
    protected $sign = array(
        // ASCII whitespace and punctuation
        "\r", "\n", "\t", ' ',
        '`', '~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_',
        '+', '=', '|', '\\', '\'', '"', ';', ':', '/', '?', '.', '>', ',', '<',
        '[', '{', ']', '}',
        // CJK / full-width punctuation
        '·', '~', '!', '@', '#', '¥', '¥', '%', '…', '&', '×', '(', ')',
        '-', '—', '=', '+', '\', '|', '【', '{', '】', '}', '‘', '’', '“',
        '”', ';', ':', '、', '?', '。', '》', ',', '《',
        "\xE3\x80\x80", // U+3000 ideographic space
    );

    /**
     * Mount the dictionary by opening a PDO connection.
     *
     * @param array $pdo Connection settings with the keys dbType, dbHost,
     *                   dbName, dbUser and dbPass.
     *
     * @return void
     *
     * @throws PDOException When the connection cannot be established.
     */
    public function openDict($pdo)
    {
        $dsn = $pdo['dbType'] . ':host=' . $pdo['dbHost'] . ';dbname=' . $pdo['dbName'];

        $this->db = new PDO($dsn, $pdo['dbUser'], $pdo['dbPass']);
        // Surface query failures as exceptions instead of a fatal
        // "member function on boolean" further down the line.
        $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        $this->db->exec('SET NAMES utf8');

        // Any statement prepared on a previous connection is now stale.
        $this->lookup = null;
    }

    /**
     * Unmount the dictionary and release the connection.
     *
     * @return void
     */
    public function closeDict()
    {
        $this->lookup = null;
        $this->db = null;
    }

    /**
     * Tell whether a string is a word of the mounted dictionary.
     *
     * The lookup runs through a prepared statement that is created once per
     * connection, so the candidate is never interpolated into the SQL text.
     *
     * @param string $string Candidate word.
     *
     * @return bool True when the dictionary contains the word.
     *
     * @throws RuntimeException When no dictionary is mounted or the lookup
     *                          statement cannot be prepared.
     */
    public function findinDict($string)
    {
        $this->querytimes++;

        if ($this->lookup === null) {
            if ($this->db === null) {
                throw new RuntimeException('No dictionary is mounted; call openDict() first.');
            }

            $statement = $this->db->prepare('SELECT `word` FROM `dict` WHERE `word` = ? LIMIT 1');
            if ($statement === false) {
                throw new RuntimeException('Unable to prepare the dictionary lookup statement.');
            }
            $this->lookup = $statement;
        }

        $this->lookup->execute(array($string));
        $found = $this->lookup->fetch(PDO::FETCH_ASSOC) !== false;
        $this->lookup->closeCursor();

        return $found;
    }

    /**
     * Roughly split a sentence into short phrases.
     *
     * The sentence is cut at every punctuation character and at every
     * boundary between a Chinese run and a Latin/digit run. Each returned
     * item is array(text, flag); flag '1' means the text is already a final
     * token, flag '0' means it still needs dictionary segmentation.
     *
     * @param string $sentence  Complete sentence.
     * @param int    $minSen    Chinese phrases of at most this many characters
     *                          are returned as final tokens.
     * @param bool   $saveInter Whether punctuation characters are kept as tokens.
     * @param string $encoding  Character encoding of $sentence.
     *
     * @return array List of array(string $text, string $flag) items.
     */
    public function cnSplit($sentence, $minSen, $saveInter, $encoding)
    {
        $pieces = array();
        $cnBuffer = '';
        $enBuffer = '';
        $length = mb_strlen($sentence, $encoding);

        // At most one of the two buffers is non-empty at any time: switching
        // script always flushes the other buffer first.
        for ($i = 0; $i < $length; $i++) {
            $char = mb_substr($sentence, $i, 1, $encoding);

            if (in_array($char, $this->sign, true)) {
                $this->flushChinese($pieces, $cnBuffer, $minSen, $encoding);
                $this->flushLatin($pieces, $enBuffer);
                if ($saveInter) {
                    $pieces[] = array($char, '1');
                }
            } elseif (in_array($char, $this->enChar, true)) {
                $this->flushChinese($pieces, $cnBuffer, $minSen, $encoding);
                $enBuffer .= $char;
            } else {
                $this->flushLatin($pieces, $enBuffer);
                $cnBuffer .= $char;
            }
        }

        // The sentence may end without a closing punctuation mark.
        $this->flushChinese($pieces, $cnBuffer, $minSen, $encoding);
        $this->flushLatin($pieces, $enBuffer);

        return $pieces;
    }

    /**
     * Segment a sentence into words.
     *
     * @param string $sentence   Sentence to segment.
     * @param array  $pdo        Dictionary connection settings, see openDict().
     *                           The default points at a local development
     *                           database; pass real settings in production.
     * @param int    $maxlen     Longest substring (in characters) tried against
     *                           the dictionary. Larger is slower but more
     *                           accurate. Values below 1 are treated as 1.
     * @param int    $minSen     Chinese phrases of at most this many characters
     *                           are kept whole instead of being segmented.
     * @param bool   $saveSingle Whether single characters that form no word are kept.
     * @param bool   $saveInter  Whether punctuation is kept.
     * @param string $encoding   Character encoding of $sentence.
     *
     * @return array List of tokens, also available as $this->result.
     */
    public function segment($sentence, $pdo = array('dbType' => 'mysql', 'dbHost' => 'localhost', 'dbName' => 'furyee', 'dbUser' => 'root', 'dbPass' => 'root'), $maxlen = 8, $minSen = 3, $saveSingle = false, $saveInter = false, $encoding = 'utf-8')
    {
        $this->openDict($pdo);
        $this->result = array();
        $this->querytimes = 0;

        // A window below one character would never consume any input.
        $maxlen = max(1, (int) $maxlen);

        try {
            foreach ($this->cnSplit($sentence, $minSen, $saveInter, $encoding) as $item) {
                if ((string) $item[1] === '1') {
                    // Already a final token (Latin run, short phrase, punctuation).
                    $token = trim($item[0]);
                    if ($token !== '') {
                        $this->result[] = $token;
                    }
                    continue;
                }

                $this->scanPhrase($item[0], $maxlen, $saveSingle, $encoding);
            }
        } catch (Exception $e) {
            // Do not leave the dictionary mounted when a lookup fails.
            $this->closeDict();
            throw $e;
        }

        $this->closeDict();

        return $this->result;
    }

    /**
     * Segment a sentence and encode its Chinese characters as GB2312
     * zone/position codes ("quwei" codes), so that the text can be indexed
     * by a full-text engine that only tokenises ASCII.
     *
     * Single-byte characters are copied unchanged. A multi-byte character
     * becomes four decimal digits (two for the zone, two for the position).
     * Characters that have no GB2312 representation are copied unchanged.
     *
     * The parameters are the same as for segment().
     *
     * @return array array(string $encoded, array $words): the space-joined,
     *               encoded token string and the plain token list.
     */
    public function zhcode($sentence, $pdo = array('dbType' => 'mysql', 'dbHost' => 'localhost', 'dbName' => 'furyee', 'dbUser' => 'root', 'dbPass' => 'root'), $maxlen = 8, $minSen = 3, $saveSingle = false, $saveInter = false, $encoding = 'utf-8')
    {
        $words = $this->segment($sentence, $pdo, $maxlen, $minSen, $saveSingle, $saveInter, $encoding);
        $joined = implode(' ', $words);

        $encoded = '';
        $length = mb_strlen($joined, $encoding);
        for ($i = 0; $i < $length; $i++) {
            $char = mb_substr($joined, $i, 1, $encoding);
            $encoded .= strlen($char) === 1 ? $char : $this->toZoneCode($char, $encoding);
        }

        return array($encoded, $words);
    }

    /**
     * Append the buffered Chinese run to the phrase list and reset the buffer.
     * Runs of at most $minSen characters are flagged '1' (final token).
     */
    private function flushChinese(array &$pieces, &$buffer, $minSen, $encoding)
    {
        $text = trim($buffer);
        $buffer = '';
        if ($text === '') {
            return;
        }

        $pieces[] = array($text, mb_strlen($text, $encoding) <= $minSen ? '1' : '0');
    }

    /**
     * Append the buffered Latin/digit run to the phrase list as a final token
     * and reset the buffer.
     */
    private function flushLatin(array &$pieces, &$buffer)
    {
        $text = trim($buffer);
        $buffer = '';
        if ($text !== '') {
            $pieces[] = array($text, '1');
        }
    }

    /**
     * Run the forward maximum-match scan over one Chinese phrase and append
     * the resulting words to $this->result.
     */
    private function scanPhrase($phrase, $maxlen, $saveSingle, $encoding)
    {
        $pending = ''; // consecutive characters that matched nothing so far
        $length = mb_strlen($phrase, $encoding);
        $pos = 0;

        while ($pos < $length) {
            // Longest candidate that still fits into the phrase.
            $window = min($maxlen, $length - $pos);
            $matched = false;
            $candidate = '';

            for ($size = $window; $size > 0; $size--) {
                $candidate = mb_substr($phrase, $pos, $size, $encoding);
                if ($this->findinDict($candidate)) {
                    $matched = true;
                    break;
                }
            }

            if ($matched) {
                $this->emitPending($pending, $saveSingle, $encoding);
                $this->result[] = $candidate;
                $pos += $size;
                continue;
            }

            // Nothing matched, so $candidate is the single character at $pos.
            if (in_array($candidate, $this->highfreq, true)) {
                // A high-frequency character ends the current unknown word.
                $this->emitPending($pending, $saveSingle, $encoding);
                if ($saveSingle) {
                    $this->result[] = $candidate;
                }
            } else {
                $pending .= $candidate;
            }
            $pos++;
        }

        // Whatever is still queued at the end of the phrase is emitted as is,
        // even when it is a single character and $saveSingle is off.
        if ($pending !== '') {
            $this->result[] = $pending;
        }
    }

    /**
     * Emit the queued unmatched characters as one unknown word and clear the
     * queue. A lone character is dropped unless $saveSingle is set.
     */
    private function emitPending(&$pending, $saveSingle, $encoding)
    {
        if ($pending === '') {
            return;
        }

        if ($saveSingle || mb_strlen($pending, $encoding) > 1) {
            $this->result[] = $pending;
        }
        $pending = '';
    }

    /**
     * Convert one multi-byte character to its four-digit GB2312 zone/position
     * code, or return the character unchanged when GB2312 cannot represent it.
     */
    private function toZoneCode($char, $encoding)
    {
        // iconv() raises a notice and returns false for characters outside
        // GB2312; the failure is handled explicitly right below.
        $gb = @iconv($encoding, 'GB2312', $char);
        if ($gb === false || strlen($gb) !== 2) {
            return $char;
        }

        // GB2312 stores zone and position with an offset of 0xA0 (160).
        return sprintf('%02d%02d', ord($gb[0]) - 160, ord($gb[1]) - 160);
    }
}
?>
演示文件: fc.php
PHP代码
<?php
// Demo: segment a sample HTML fragment, print the encoded result and the
// elapsed wall-clock time.
$starttime = ExecTime();

// The class is mandatory for everything below, so fail hard if it is missing.
require_once 'WordSegment.class.php';

$str = '冻番茄www.phpd.cn<p>现在市面上的樱花啊,爆米花的笔都可以画出美丽可爱的立体图案,可以装饰手机什么的,但是价钱贵,还不方便,比方说爆米花的,就要通过加热才会有效果.</p><p>最好有一种写出来就是爆米花的效果的笔.那就最好了,最好还可以通过不停的温度有不同的颜色(就像会变色的手表)和效果……</p><p><img src="/upload/image/2008-09-01/2008090104363891/55a19b38ee7f6671e64b7ee46feb2b0a.jpg" alt="" /></p>';

$ws = new WordSegment; // segmenter instance; uses the default dictionary connection
$result = $ws->zhcode($str); // array(encoded string, token list)
print_r($result);

$totaltime = ExecTime() - $starttime;
echo "<BR><BR><BR>分词时间: $totaltime 秒<br><br>www.phpd.cn 冻番茄";
/**
 * Current Unix timestamp in seconds, with microsecond precision.
 *
 * @return float
 */
function ExecTime()
{
    // microtime(true) already returns seconds + microseconds as one float,
    // so the string form does not need to be split and summed by hand.
    return microtime(true);
}
- ?>