Sphinx中文分词在discuz 中的应用。
Sphinx-for-chinese 是一款专注于中文搜索的全文检索软件,它在 Sphinx 的基础上添加了中文处理模块,并优化了中文搜索效果。以某论坛几千万条数据的环境来看,其效果比 Coreseek 封装的 Sphinx 中文分词 + mmseg3 组合要好一点。
1.准备环境
# Fetch the Sphinx-for-Chinese source tarball and the xdict dictionary into
# /var/tmp, then create the two directories that will hold the index files.
# (One command per line: fused onto a single line, `cd` would receive every
# following word as an extra argument and nothing would be downloaded.)
cd /var/tmp/
wget http://www.sphinx-search.com/downloads/sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
wget http://docs.zjyxh.com/discuzx/sphinx-for-chinese/xdict
mkdir -p /data0/dzbbs/indexdata/
mkdir -p /data1/dzbbs/indexdata/
2.安装 sphinx for chinese
# Build and install Sphinx-for-Chinese under /usr/local/sfc22 with MySQL
# support and 64-bit document ids.

# Sphinx is C++ code, so CXX must name the C++ driver (g++), not gcc.
CXX=g++
# Optimization flags for x86_64 GCC. Each flag must be a single token:
# -fprefetch-loop-arrays and -fbranch-target-load-optimize2 contain no spaces.
CFLAGS="-O3 -fomit-frame-pointer -pipe -march=nocona -mfpmath=sse \
-m128bit-long-double -mmmx -msse -msse2 -maccumulate-outgoing-args -m64 \
-ftree-loop-linear -fprefetch-loop-arrays -freg-struct-return -fgcse-sm \
-fgcse-las -frename-registers -fforce-addr -fivopts -ftree-vectorize \
-ftracer -minline-all-stringops -fbranch-target-load-optimize2"
CXXFLAGS="${CFLAGS}"
export CXX CXXFLAGS CFLAGS

# Unpack the tarball downloaded to the current directory and build from it.
tar zxf sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
cd sphinx-for-chinese-2.2.1-dev-r4311
./configure --prefix=/usr/local/sfc22/ --with-mysql=/usr/local/mysql/ --enable-id64
make -j8 install
3.配置 sphinx for chinese
# Work inside the Sphinx-for-Chinese configuration directory.
cd /usr/local/sfc22/etc/
# Rename the xdict already present there so it is kept as a backup.
mv xdict xdictbak
# Copy in the xdict file from /var/tmp, preserving its attributes.
cp -a /var/tmp/xdict .
# Edit the main configuration file.
vim sphinx.conf
source pre_forum_thread
{
    # ---- MySQL connection ----
    type     = mysql
    sql_host = localhost
    sql_port = 3306
    sql_user = sphinx
    sql_pass = sphinx2013
    sql_db   = discuzx

    # ---- pre-queries (run in the order listed) ----
    sql_query_pre = SET NAMES UTF8
    sql_query_pre = SET SESSION query_cache_type=OFF
    # Store the current highest thread id in the counter table under indexid 1.
    sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 1, MAX(tid) FROM pre_forum_thread

    # ---- ranged fetch: lowest tid up to the stored counter, 5000 ids per step ----
    sql_query_range = SELECT (SELECT MIN(tid) FROM pre_forum_thread),maxid FROM pre_common_sphinxcounter WHERE indexid=1
    sql_range_step  = 5000
    sql_query = SELECT t.tid as id,t.tid,t.subject,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
        FROM pre_forum_thread AS t \
        WHERE t.tid>=$start AND t.tid<=$end

    # ---- attributes ----
    sql_attr_uint      = tid
    sql_attr_uint      = digest
    sql_attr_uint      = displayorder
    sql_attr_uint      = authorid
    sql_attr_uint      = special
    sql_attr_timestamp = lastpost

    # ---- per-document lookup query ----
    sql_query_info = SELECT * FROM pre_forum_thread WHERE tid=$id
}
#threads
index pre_forum_thread
{
    # ---- data source and on-disk location ----
    source  = pre_forum_thread
    path    = /data0/dzbbs/indexdata/pre_forum_thread  # on Windows, prefer a full path
    docinfo = extern
    mlock   = 0

    # ---- tokenization ----
    # Character table. Note: used on its own, this makes Sphinx split Chinese
    # text into single characters (character-level indexing); real Chinese
    # word segmentation needs a segmentation add-on such as coreseek or sfc.
    charset_type       = utf-8
    chinese_dictionary = /usr/local/sfc22/etc/xdict
    morphology         = none
    ngram_len          = 0
    html_strip         = 1

    # ---- word length limits ----
    # Minimum length of an indexed word.
    min_word_len   = 2
    min_prefix_len = 0
    min_infix_len  = 1

    # ---- disabled settings kept from the original file ----
    #charset_dictpath = /usr/local/mmseg32/etc/  # for BSD/Linux; must end with "/"
    #charset_dictpath= etc/                      # for Windows; must end with "/"
    #charset_debug = 0
    #charset_type = zh_cn.utf-8
}
#threads_minute
source pre_forum_thread_minute : pre_forum_thread
{
    # Delta source for threads; settings not listed here come from
    # pre_forum_thread. The pre-queries are restated without the counter
    # REPLACE statement that the parent source runs.
    #sql_query_pre =
    sql_query_pre   = SET NAMES UTF8
    sql_query_pre   = SET SESSION query_cache_type=OFF

    # Range runs from one below the stored counter (indexid 1) up to the
    # current highest tid.
    sql_query_range = SELECT maxid-1,(SELECT MAX(tid) FROM pre_forum_thread) FROM pre_common_sphinxcounter WHERE indexid=1
}
#threads_minute
index pre_forum_thread_minute : pre_forum_thread
{
    # Delta index for threads: same settings as pre_forum_thread, but fed by
    # the delta source and stored in its own files.
    path   = /data0/dzbbs/indexdata/pre_forum_thread_minute  # on Windows, prefer a full path
    source = pre_forum_thread_minute
}
#posts
source pre_forum_post : pre_forum_thread
{
    # Connection settings are taken from pre_forum_thread.
    type = mysql

    # ---- pre-queries (run in the order listed) ----
    # The list starts with an explicit empty entry, as in the original file.
    # NOTE(review): presumably meant to clear the inherited pre-queries - confirm.
    sql_query_pre =
    sql_query_pre = SET NAMES UTF8
    sql_query_pre = SET SESSION query_cache_type=OFF
    # Store the current highest post id in the counter table under indexid 2.
    sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 2, MAX(pid) FROM pre_forum_post

    # ---- ranged fetch: lowest pid up to the stored counter, 5000 ids per step ----
    sql_query_range = SELECT (SELECT MIN(pid) FROM pre_forum_post),maxid FROM pre_common_sphinxcounter WHERE indexid=2
    sql_range_step  = 5000
    # Only rows with p.first=1 are selected; thread columns come from the join.
    sql_query = SELECT p.pid AS id,p.tid,p.subject,p.message,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
        FROM pre_forum_post AS p LEFT JOIN pre_forum_thread AS t USING(tid) where p.pid >=$start and p.pid <=$end \
        AND p.first=1

    # ---- attributes ----
    sql_attr_uint      = tid
    sql_attr_uint      = digest
    sql_attr_uint      = displayorder
    sql_attr_uint      = authorid
    sql_attr_uint      = special
    sql_attr_timestamp = lastpost

    # ---- per-document lookup query ----
    sql_query_info = SELECT * FROM pre_forum_post WHERE pid=$id
}
#posts
index pre_forum_post
{
    # ---- data source and on-disk location ----
    source  = pre_forum_post
    path    = /data1/dzbbs/indexdata/pre_forum_post  # on Windows, prefer a full path
    docinfo = extern
    mlock   = 0

    # ---- tokenization ----
    # Character table. Note: used on its own, this makes Sphinx split Chinese
    # text into single characters (character-level indexing); real Chinese
    # word segmentation needs a segmentation add-on such as coreseek or sfc.
    charset_type       = utf-8
    chinese_dictionary = /usr/local/sfc22/etc/xdict
    morphology         = none
    ngram_len          = 0
    html_strip         = 0

    # ---- word length limits ----
    # Minimum length of an indexed word.
    min_word_len   = 2
    min_prefix_len = 0
    min_infix_len  = 1

    # ---- disabled settings kept from the original file ----
    #charset_dictpath = /usr/local/mmseg32/etc/  # for BSD/Linux; must end with "/"
    #charset_dictpath= etc/                      # for Windows; must end with "/"
    #charset_debug = 0
    #charset_type = zh_cn.utf-8
}
#pre_forum_post_minute
source pre_forum_post_minute : pre_forum_post
{
    # Delta source for posts; settings not listed here come from
    # pre_forum_post. The pre-queries are restated without the counter
    # REPLACE statement that the parent source runs.
    sql_query_pre   = SET NAMES UTF8
    sql_query_pre   = SET SESSION query_cache_type=OFF

    # Range runs from one below the stored counter (indexid 2) up to the
    # current highest pid.
    sql_query_range = SELECT maxid-1,(SELECT MAX(pid) FROM pre_forum_post) FROM pre_common_sphinxcounter WHERE indexid=2
}
#pre_forum_post_minute
index pre_forum_post_minute : pre_forum_post
{
    # Delta index for posts: same settings as pre_forum_post, but it must be
    # fed by the delta source. Pointing it at pre_forum_post would make every
    # delta run re-read the full post range.
    source = pre_forum_post_minute
    # Index files are named after this index, matching the convention used by
    # pre_forum_thread_minute.
    path   = /data0/dzbbs/indexdata/pre_forum_post_minute  # on Windows, prefer a full path
}
#全局indexer定义
indexer
{
    # Global indexer settings: memory limit and write buffer size.
    write_buffer = 64M
    mem_limit    = 2047M
}
#searchd服务定义
searchd
{
    # ---- network ----
    listen          = 3312
    listen_backlog  = 20
    read_timeout    = 5
    max_children    = 30
    max_packet_size = 32M

    # ---- query limits ----
    max_matches       = 500
    max_filters       = 1024
    max_filter_values = 16384

    # ---- index handling ----
    seamless_rotate = 0
    preopen_indexes = 0
    unlink_old      = 1

    # ---- buffers and pools ----
    read_buffer      = 1M
    mva_updates_pool = 16M

    # ---- pid and log files (on Windows, prefer full paths) ----
    pid_file  = /usr/local/sfc22/var/log/searchd_discuzx.pid
    log       = /usr/local/sfc22/var/log/searchd_discuzx.log
    query_log = /usr/local/sfc22/var/log/query_discuzx.log
}
4 测试sfc 中文分词效果。
# Run a test query with the search command-line tool, using the config above.
/usr/local/sfc22/bin/search -c /usr/local/sfc22/etc/sphinx.conf 分享精彩
words:
1. '分享': 194 documents, 266 hits
2. '精彩': 368 documents, 425 hits
3. '精彩视频': 2 documents, 2 hits
4. '精彩节目': 2 documents, 2 hits
5. '精彩绝伦': 2 documents, 2 hits
6. '精彩纷呈': 7 documents, 7 hits
7. '精彩瞬间': 5 documents, 5 hits
8. '精彩回顾': 2 documents, 3 hits
可以看出sfc 中文分词效果还是比较准确的。
5 启动sfc
# Start searchd with the config above.
/usr/local/sfc22/bin/searchd -c /usr/local/sfc22/etc/sphinx.conf
6 编写主索引和增量索引脚本
1)编写脚本:重建主题表增量索引和帖子表增量索引,并将主题表增量索引合并进主题表主索引,每20分钟运行一次。
vim /usr/local/bin/bbssearch.sh
#!/bin/bash
# desc: Sphinx-for-Chinese delta indexing script.
#       Rebuilds the thread and post delta indexes, then merges the thread
#       delta index into the thread main index.
# date: 2014.12.23
# tested in CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch.sh
# written by coralzd@gmail.com coralzd.blog.51cto.com blog.zjyxh.com
#
# Each statement must be on its own line: on a single line, everything after
# the shebang is one comment and the script does nothing.

# Make the sfc binaries (indexer) reachable.
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH

# Date parts captured once at start-up; the time part of each log line is
# evaluated when that line is written.
Y=$(date +%Y)
m=$(date +%m)
d=$(date +%d)

# Rebuild the thread delta index and tell searchd to rotate it in.
echo "-- thread main index start `date +${Y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_thread_minute --rotate
echo "-- thread merge index end `date +${Y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log

# Rebuild the post delta index and tell searchd to rotate it in.
echo "-- post main index start `date +${Y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_post_minute --rotate
echo "-- post merge index end `date +${Y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log

# Merge the thread delta index into the thread main index.
echo "-- thread main merge + index start `date +${Y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
indexer --merge pre_forum_thread pre_forum_thread_minute --merge-dst-range deleted 0 0 --rotate
echo "-- thread main merge + index end `date +${Y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
2)编写帖子表主索引,每周运行一次。
vim /usr/local/bin/bbssearch2.sh
#!/bin/bash
# desc: Sphinx-for-Chinese full rebuild of the post main index.
# date: 2014.12.23
# tested in CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch2.sh
# written by coralzd@gmail.com coralzd.blog.51cto.com blog.zjyxh.com
#
# Each statement must be on its own line: on a single line, everything after
# the shebang is one comment and the script does nothing.

# Make the sfc binaries (indexer) reachable.
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH

# Date parts captured once at start-up; the time part of each log line is
# evaluated when that line is written.
Y=$(date +%Y)
m=$(date +%m)
d=$(date +%d)

# Rebuild the post main index and tell searchd to rotate it in.
echo "-- post main index start `date +${Y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_post --rotate
echo "-- post merge index end `date +${Y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
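将这两个脚本加入 crontab 计划任务,例如:bbssearch.sh 每20分钟运行一次(`*/20 * * * * /usr/local/bin/bbssearch.sh`),bbssearch2.sh 每周运行一次(`0 3 * * 0 /usr/local/bin/bbssearch2.sh`)。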