本文实例讲述了php实现的中文分词类。分享给大家供大家参考,具体如下:
该中文分词类源码使用http://tools.jb51.net/code/jb51_php_format进行了格式化处理,便于阅读。具体代码如下:
/**
 * Dictionary-driven word segmenter for text in a double-byte encoding.
 *
 * Every byte >= 129 is treated as the first byte of a two-byte character
 * (presumably GBK/GB2312 -- confirm against the dictionary you load; UTF-8
 * input is NOT handled). Runs of lower bytes are kept together as one
 * "ASCII chunk". Double-byte characters are grouped into words by scanning
 * each line from its end and taking the longest dictionary entry (2 to 4
 * characters) that ends at the current position; characters that match
 * nothing are emitted on their own.
 *
 * Output tokens are separated by single spaces; each line starts with a
 * space and ends with "\n".
 */
class Segmentation
{
    /** Behaviour switches; change them through the setters below. */
    public $options = array("lowercase" => true, "segment_english" => false);

    /** Name taken from the header line of the last dictionary passed to load(). */
    public $dict_name = "Unknown";

    /** Dictionary words stored as array keys (word => 1) for constant-time lookup. */
    public $dict_words = array();

    /**
     * Enable or disable lower-casing of ASCII chunks.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always true.
     */
    public function setLowercase($value)
    {
        $this->options["lowercase"] = (bool) $value;
        return true;
    }

    /**
     * Enable or disable splitting of ASCII chunks around punctuation.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always true.
     */
    public function setSegmentEnglish($value)
    {
        $this->options["segment_english"] = (bool) $value;
        return true;
    }

    /**
     * Load a dictionary file and merge its words into the current word list.
     *
     * The first line must be "DICT_WORD_W" optionally followed by a space and
     * a dictionary name; every following non-empty line is one word.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool False when the file is missing, unreadable, empty, or its
     *              header type is not "DICT_WORD_W"; true otherwise.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file) || !is_readable($dict_file)) {
            return false;
        }
        $fp = fopen($dict_file, "r");
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);
            return false;
        }

        // explode() always yields at least one element, so a header without a
        // name (or with a trailing space) no longer triggers a notice.
        $fields = explode(" ", trim($header));
        $dict_type = $fields[0];
        $dict_name = isset($fields[1]) ? $fields[1] : "Unknown";

        // As before, the name is recorded even when the type check below fails.
        $this->dict_name = $dict_name;
        if ($dict_type !== "DICT_WORD_W") {
            fclose($fp);
            return false;
        }

        // Let fgets() drive the loop: a feof()-driven loop performs one extra
        // failed read at end of file and would register "" as a word.
        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            if ($word !== "") {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);
        return true;
    }

    /**
     * @return string Name of the most recently loaded dictionary, or "Unknown".
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segment a (possibly multi-line) string.
     *
     * @param string $str Text to segment, lines separated by "\n".
     * @return string|false Segmented text, or false when no dictionary is loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }
        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segment the contents of a file.
     *
     * @param string $filename Path of the file to read.
     * @return string|false Segmented text, or false when no dictionary is
     *                      loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }
        $lines = is_readable($filename) ? file($filename) : false;
        if ($lines === false) {
            return false;
        }
        return $this->_segmentLines($lines);
    }

    /**
     * Segment each line and join the results, one "\n" after every line.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped from each.
     * @return string Segmented text with runs of spaces collapsed to one space.
     */
    public function _segmentLines($lines)
    {
        $segmented = "";
        foreach ($lines as $line) {
            $segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        // Collapse every run of two or more spaces into a single space.
        $collapsed = preg_replace('/ {2,}/', ' ', $segmented);
        return $collapsed === null ? $segmented : $collapsed;
    }

    /**
     * Segment a single line.
     *
     * @param string $str One line of text without its line terminator.
     * @return string The tokens of the line, each preceded by one space
     *                ("" for an empty line).
     */
    public function _segmentLine($str)
    {
        $tokens = $this->_tokenize((string) $str);
        $result = "";
        $pos = count($tokens);

        // Walk backwards so the longest word ending at $pos can be matched.
        while ($pos > 0) {
            $token = $tokens[$pos - 1];

            if (is_array($token)) {
                // ASCII chunk: never looked up in the dictionary.
                $chunk = $token[0];
                if ($this->options["segment_english"]) {
                    $chunk = $this->_splitPunctuation($chunk);
                }
                if ($this->options["lowercase"]) {
                    $chunk = strtolower($chunk);
                }
                $result = " " . $chunk . $result;
                $pos--;
                continue;
            }

            // $candidates[$n] is the string formed by the $n characters that
            // end at $pos. The window is at most 4 characters and stops at an
            // ASCII chunk, which cannot be part of a dictionary word.
            $window = min($pos, 4);
            $candidates = array(1 => $token);
            for ($n = 2; $n <= $window; $n++) {
                $previous = $tokens[$pos - $n];
                if (is_array($previous)) {
                    break;
                }
                $candidates[$n] = $previous . $candidates[$n - 1];
            }

            // Longest match first; single characters are never looked up.
            $matched = 0;
            for ($n = count($candidates); $n > 1; $n--) {
                if (array_key_exists($candidates[$n], $this->dict_words)) {
                    $matched = $n;
                    break;
                }
            }

            if ($matched > 0) {
                $result = " " . $candidates[$matched] . $result;
                $pos -= $matched;
            } else {
                $result = " " . $token . $result;
                $pos--;
            }
        }

        return $result;
    }

    /**
     * Split a line into double-byte characters and ASCII chunks.
     *
     * @param string $str Raw bytes of one line.
     * @return array List whose items are either a 2-byte string (one
     *               double-byte character) or a one-element array wrapping a
     *               run of bytes below 129.
     */
    private function _tokenize($str)
    {
        $tokens = array();
        $length = strlen($str);
        $i = 0;

        while ($i < $length) {
            if (ord($str[$i]) >= 129) {
                // A lead byte with nothing after it is paired with a space.
                $second = ($i + 1 < $length) ? $str[$i + 1] : " ";
                $tokens[] = $str[$i] . $second;
                $i += 2;
            } else {
                $start = $i;
                while ($i < $length && ord($str[$i]) < 129) {
                    $i++;
                }
                $tokens[] = array(substr($str, $start, $i - $start));
            }
        }

        return $tokens;
    }

    /**
     * Surround punctuation in an ASCII chunk with spaces.
     *
     * NOTE(review): the character class was reconstructed from a pattern whose
     * escapes had been stripped; it is taken to be ASCII punctuation plus tab
     * and form feed -- confirm against the original class if available.
     *
     * @param string $text ASCII chunk.
     * @return string The chunk with punctuation runs set off by spaces and
     *                adjacent punctuation characters separated pairwise.
     */
    private function _splitPunctuation($text)
    {
        $punct = '[!"#$%&\'()*+,\-.\/:;<=>?@\[\\\\\]^_`{|}~\t\f]';

        $spaced = preg_replace('/(' . $punct . '+)/', ' $1 ', $text);
        if ($spaced === null) {
            return $text;
        }
        $separated = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $spaced);
        return $separated === null ? $spaced : $separated;
    }
}
对更多PHP相关内容感兴趣的读者可查看本站专题:《php常用函数与技巧总结》、《php字符串(string)用法总结》、《PHP数组(Array)操作技巧大全》、《PHP基本语法入门教程》、《php+mysql数据库操作入门教程》及《php常见数据库操作技巧汇总》。
希望本文所述对大家PHP程序设计有所帮助。