PHP实现的中文分词类完整实例有哪些?

PHP实现的中文分词类完整实例有哪些?

本文实例讲述了 PHP 实现的中文分词类,分享给大家供大家参考,具体如下:

该中文分词类源码使用 http://tools.jb51.net/code/jb51_php_format 进行了格式化处理,便于阅读。具体代码如下:

/**
 * Dictionary-based Chinese word segmenter (backward maximum matching).
 *
 * Text is processed byte-wise: every byte with a value >= 129 is treated as
 * the lead byte of a two-byte character, everything else as part of a
 * single-byte (ASCII-like) run.
 * NOTE(review): this matches a double-byte encoding such as GBK/GB2312;
 * UTF-8 input (three bytes per CJK character) will not segment correctly --
 * confirm the expected encoding with callers.
 *
 * Typical use: call load() with a dictionary file, then segmentString() or
 * segmentFile(). The result is the input with a single space inserted
 * between the recognised words.
 */
class Segmentation {
    /**
     * Behaviour switches.
     *  - lowercase:       lowercase single-byte runs in the output.
     *  - segment_english: additionally space-separate punctuation inside
     *                     single-byte runs.
     */
    public $options = array(
        'lowercase' => TRUE,
        'segment_english' => FALSE,
    );

    /** Name read from the dictionary header, 'Unknown' when absent. */
    public $dict_name = 'Unknown';

    /** Loaded dictionary: word => 1 (used as a hash set). */
    public $dict_words = array();

    /**
     * Enable or disable lowercasing of single-byte runs.
     *
     * @param mixed $value Truthy to enable, falsy to disable.
     * @return bool Always TRUE.
     */
    public function setLowercase($value) {
        $this->options['lowercase'] = $value ? TRUE : FALSE;
        return TRUE;
    }

    /**
     * Enable or disable punctuation splitting inside single-byte runs.
     *
     * @param mixed $value Truthy to enable, falsy to disable.
     * @return bool Always TRUE.
     */
    public function setSegmentEnglish($value) {
        $this->options['segment_english'] = $value ? TRUE : FALSE;
        return TRUE;
    }

    /**
     * Load a word dictionary and merge it into the words already loaded.
     *
     * File format: the first line is a header "DICT_WORD_W" optionally
     * followed by a tab and the dictionary name; every following line is
     * one word.
     *
     * @param string $dict_file Path to the dictionary file.
     * @return bool TRUE on success; FALSE when the file is missing, cannot
     *              be opened, is empty, or has an unexpected header type.
     */
    public function load($dict_file) {
        if (!file_exists($dict_file)) {
            return FALSE;
        }
        $fp = fopen($dict_file, 'r');
        if ($fp === FALSE) {
            return FALSE;
        }

        $header = fgets($fp, 1024);
        if ($header === FALSE) {
            fclose($fp);
            return FALSE;
        }

        // Header is "<type>[\t<name>]". Index access (instead of list())
        // tolerates a header whose trailing tab was removed by trim().
        $fields = explode("\t", trim($header));
        $dict_type = $fields[0];
        $dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        // The name is recorded even when the type check below fails.
        $this->dict_name = $dict_name;
        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);
            return FALSE;
        }

        // Read whole lines and stop on the first failed read, so no empty
        // key is inserted at EOF and long entries are not split up.
        while (($line = fgets($fp)) !== FALSE) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);
        return TRUE;
    }

    /**
     * @return string Name of the most recently loaded dictionary.
     */
    public function getDictName() {
        return $this->dict_name;
    }

    /**
     * Segment a (possibly multi-line) string.
     *
     * @param string $str Text to segment; lines are separated by "\n".
     * @return string|bool Segmented text, or FALSE if no dictionary words
     *                     are loaded.
     */
    public function segmentString($str) {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        $lines = explode("\n", $str);
        return $this->_segmentLines($lines);
    }

    /**
     * Segment the contents of a file.
     *
     * @param string $filename File to read.
     * @return string|bool Segmented text, or FALSE if no dictionary words
     *                     are loaded or the file cannot be read.
     */
    public function segmentFile($filename) {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        $lines = file($filename);
        if ($lines === FALSE) {
            return FALSE;
        }
        return $this->_segmentLines($lines);
    }

    /**
     * Segment each line and join the results.
     *
     * Every output line ends with " \n"; runs of spaces are collapsed to a
     * single space.
     *
     * @param array $lines Input lines (trailing whitespace is stripped).
     * @return string Segmented text.
     */
    public function _segmentLines($lines) {
        $contents_segmented = '';
        foreach ($lines as $line) {
            $contents_segmented .= $this->_segmentLine(rtrim($line)) . " \n";
        }
        // Collapse double spaces until none remain. Searching for a single
        // space here would never terminate, because every line ends " \n".
        while (strpos($contents_segmented, '  ') !== FALSE) {
            $contents_segmented = str_replace('  ', ' ', $contents_segmented);
        }
        return $contents_segmented;
    }

    /**
     * Segment a single line.
     *
     * Phase 1 tokenises the line into two-byte characters (plain strings)
     * and single-byte runs (wrapped in a one-element array as a marker).
     * Phase 2 walks the tokens from the end, greedily matching the longest
     * dictionary word of 2 to 4 characters that ends at the current
     * position; unmatched characters are emitted on their own.
     *
     * @param string $str Line to segment, without trailing newline.
     * @return string Segmented line; each word is preceded by a space.
     */
    public function _segmentLine($str) {
        $str_final = '';
        $str_array = array();
        $str_length = strlen($str);

        // Pad so that a dangling lead byte at the very end still has a
        // second byte to pair with ($str_length is deliberately not updated).
        if ($str_length > 0 && ord($str[$str_length - 1]) >= 129) {
            $str .= ' ';
        }

        // Phase 1: tokenise.
        for ($i = 0; $i < $str_length; $i++) {
            if (ord($str[$i]) >= 129) {
                $str_array[] = $str[$i] . $str[$i + 1];
                $i++;
            } else {
                $str_tmp = $str[$i];
                for ($j = $i + 1; $j < $str_length; $j++) {
                    if (ord($str[$j]) < 129) {
                        $str_tmp .= $str[$j];
                    } else {
                        break;
                    }
                }
                $str_array[] = array($str_tmp);
                $i = $j - 1;
            }
        }

        // Phase 2: backward maximum matching.
        $pos = count($str_array);
        while ($pos > 0) {
            $char = $str_array[$pos - 1];

            if (is_array($char)) {
                // Single-byte run: emit as one token, optionally normalised.
                $str_final_tmp = $char[0];
                if ($this->options['segment_english']) {
                    $str_final_tmp = preg_replace("/([\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~\t\f]+)/", " $1 ", $str_final_tmp);
                    $str_final_tmp = preg_replace("/([\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~\t\f])([\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~\t\f])/", " $1 $2 ", $str_final_tmp);
                }
                if ($this->options['lowercase']) {
                    $str_final_tmp = strtolower($str_final_tmp);
                }
                $str_final = ' ' . $str_final_tmp . $str_final;
                $pos--;
                continue;
            }

            // Build candidates ending at $pos: $word_array[$n] holds the
            // last $n characters. Stop at a single-byte run, since a
            // dictionary word cannot span it (and concatenating the marker
            // array would yield the literal string "Array").
            $max_len = min(4, $pos);
            $word_array = array(0 => '');
            for ($i = 1; $i <= $max_len; $i++) {
                $piece = $str_array[$pos - $i];
                if (is_array($piece)) {
                    break;
                }
                $word_array[$i] = $piece . $word_array[$i - 1];
            }

            // Longest match first; single characters need no lookup.
            $word_found = 0;
            for ($i = count($word_array) - 1; $i > 1; $i--) {
                if (array_key_exists($word_array[$i], $this->dict_words)) {
                    $word_found = $i;
                    break;
                }
            }

            if ($word_found) {
                $str_final = ' ' . $word_array[$word_found] . $str_final;
                $pos -= $word_found;
            } else {
                $str_final = ' ' . $char . $str_final;
                $pos--;
            }
        }
        return $str_final;
    }
}
?>

希望本文所述对大家的 PHP 程序设计有所帮助。