Sphinx 中文分词在 Discuz 中的应用。

Sphinx-for-chinese 是一款专注于中文搜索的全文检索软件,在 Sphinx 的基础上添加了中文处理模块并优化了中文搜索效果。以某论坛几千万条数据的环境来看,它的分词和搜索效果比 Coreseek 封装的 Sphinx 中文分词 + mmseg3 组合要好一点。

1.准备环境

# Fetch the source tarball and the xdict dictionary file into /var/tmp.
cd /var/tmp/
for url in \
    http://www.sphinx-search.com/downloads/sphinx-for-chinese-2.2.1-dev-r4311.tar.gz \
    http://docs.zjyxh.com/discuzx/sphinx-for-chinese/xdict
do
    wget "$url"
done
# Create the directories that hold the index files (see the "path" settings in sphinx.conf).
mkdir -p /data0/dzbbs/indexdata/ /data1/dzbbs/indexdata/

2.安装 sphinx for chinese

# Compiler selection and optimisation flags for the build.
# NOTE(review): CXX=gcc sends the C++ sources through the C driver; if the link
# step cannot find the C++ runtime, switch to CXX=g++.
CXX=gcc
# CFLAGS is assembled piece by piece so that no option is ever split by a line
# break: a newline inside the quoted value would cut an option in two
# (e.g. "-fprefetch-l" + "oop-arrays") and the compiler would reject both halves.
CFLAGS="-O3 -fomit-frame-pointer -pipe -march=nocona -mfpmath=sse -m128bit-long-double"
CFLAGS="${CFLAGS} -mmmx -msse -msse2 -maccumulate-outgoing-args -m64"
CFLAGS="${CFLAGS} -ftree-loop-linear -fprefetch-loop-arrays -freg-struct-return"
CFLAGS="${CFLAGS} -fgcse-sm -fgcse-las -frename-registers -fforce-addr -fivopts"
CFLAGS="${CFLAGS} -ftree-vectorize -ftracer -minline-all-stringops"
CFLAGS="${CFLAGS} -fbranch-target-load-optimize2"
CXXFLAGS="${CFLAGS}"
export CXX CXXFLAGS CFLAGS

# Unpack, configure for installation under /usr/local/sfc22/ with MySQL support
# and 64-bit ids enabled, then build and install with 8 parallel jobs.
tar zxf sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
cd sphinx-for-chinese-2.2.1-dev-r4311
./configure --prefix=/usr/local/sfc22/ --with-mysql=/usr/local/mysql/ --enable-id64
make -j8 install

3.配置 sphinx for chinese

# Work inside the etc/ directory of the /usr/local/sfc22 installation.
cd /usr/local/sfc22/etc/

# Keep the xdict found in this directory as xdictbak before replacing it.
mv xdict xdictbak

# Copy in the xdict that was downloaded to /var/tmp (-a preserves file attributes).
cp -a /var/tmp/xdict .

# Open the configuration file for editing; its contents follow.
vim  sphinx.conf


# Main data source for threads: one document per row of pre_forum_thread.
source pre_forum_thread
{
    # MySQL connection
    type                    = mysql
    sql_host                = localhost
    sql_user                = sphinx
    sql_pass                = sphinx2013
    sql_db                  = discuzx
    sql_port                = 3306

    # Pre-queries: set the connection charset, turn the query cache off for this
    # session, and store the current MAX(tid) in the counter table under indexid=1.
    sql_query_pre           = SET NAMES UTF8
    sql_query_pre           = SET SESSION query_cache_type=OFF
    sql_query_pre           = REPLACE INTO pre_common_sphinxcounter SELECT 1, MAX(tid) FROM pre_forum_thread

    # Ranged main query. A trailing backslash joins only the line that directly
    # follows it, so the parts of the query must stay on adjacent lines with no
    # blank line in between.
    sql_query               = SELECT t.tid as id,t.tid,t.subject,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
                              FROM pre_forum_thread  AS t \
                              WHERE t.tid>=$start AND t.tid<=$end

    # $start/$end bounds: smallest tid in the table up to the maxid stored above,
    # fetched in steps of 5000 ids.
    sql_query_range         = SELECT (SELECT MIN(tid) FROM pre_forum_thread),maxid FROM pre_common_sphinxcounter WHERE indexid=1
    sql_range_step          = 5000

    # Columns stored as attributes instead of full-text fields
    sql_attr_uint           = tid
    sql_attr_uint           = digest
    sql_attr_uint           = displayorder
    sql_attr_uint           = authorid
    sql_attr_uint           = special
    sql_attr_timestamp      = lastpost

    # Row lookup by document id
    sql_query_info          = SELECT * FROM pre_forum_thread WHERE tid=$id
}

# threads: main index built from the pre_forum_thread source
index pre_forum_thread
{
    source              = pre_forum_thread
    # On Windows, prefer an absolute path here.
    path                = /data0/dzbbs/indexdata/pre_forum_thread
    docinfo             = extern
    mlock               = 0
    morphology          = none

    # Disabled settings (they reference an mmseg dictionary path):
    #charset_dictpath = /usr/local/mmseg32/etc/    # BSD/Linux setting; must end with "/"
    #charset_dictpath= etc/                        # Windows setting; must end with "/"
    #charset_debug   =   0
    #charset_type = zh_cn.utf-8

    # Minimum length of an indexed word
    min_word_len        = 2
    html_strip          = 1

    # Note carried over from the original config: with a plain character table
    # Sphinx splits Chinese text into single characters (character-level
    # indexing); real word segmentation needs a segmentation add-on such as
    # coreseek or sfc. Here the sfc dictionary below is used.
    charset_type        = utf-8
    chinese_dictionary  = /usr/local/sfc22/etc/xdict

    min_prefix_len      = 0
    min_infix_len       = 1
    ngram_len           = 0
}

# threads_minute: delta source for threads, derived from pre_forum_thread
source pre_forum_thread_minute : pre_forum_thread
{
    # Pre-queries declared for this source. Unlike the parent, the list has no
    # REPLACE INTO pre_common_sphinxcounter statement.
    # NOTE(review): presumably this keeps a delta run from moving the stored
    # maxid - confirm against Sphinx's inheritance rules for multi-value directives.
    #sql_query_pre            =
    sql_query_pre       = SET NAMES UTF8
    sql_query_pre       = SET SESSION query_cache_type=OFF

    # Range for the delta: from (stored maxid - 1) for indexid=1 up to the current MAX(tid).
    sql_query_range     = SELECT maxid-1,(SELECT MAX(tid) FROM pre_forum_thread) FROM pre_common_sphinxcounter WHERE indexid=1
}

# threads_minute: delta index for threads; derived from index pre_forum_thread,
# with its own source and its own index files
index pre_forum_thread_minute : pre_forum_thread
{
    source  = pre_forum_thread_minute
    # On Windows, prefer an absolute path here.
    path    = /data0/dzbbs/indexdata/pre_forum_thread_minute
}

# posts: main data source for posts, derived from pre_forum_thread (which
# supplies the MySQL connection settings). One document per first post
# (p.first=1), joined with its thread row for the thread-level attributes.
source pre_forum_post : pre_forum_thread
{
    type                    = mysql

    # Pre-queries: the empty entry is kept from the original config
    # (NOTE(review): it looks intended to reset the inherited list - confirm),
    # followed by charset, query cache off, and storing the current MAX(pid)
    # in the counter table under indexid=2.
    sql_query_pre           =
    sql_query_pre           = SET NAMES UTF8
    sql_query_pre           = SET SESSION query_cache_type=OFF
    sql_query_pre           = REPLACE INTO pre_common_sphinxcounter SELECT 2, MAX(pid) FROM pre_forum_post

    # Ranged main query. A trailing backslash joins only the line that directly
    # follows it, so the parts of the query must stay on adjacent lines with no
    # blank line in between.
    sql_query               = SELECT p.pid AS id,p.tid,p.subject,p.message,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
                              FROM pre_forum_post AS p LEFT JOIN pre_forum_thread AS t USING(tid) where p.pid >=$start and p.pid <=$end \
                              AND p.first=1

    # $start/$end bounds: smallest pid in the table up to the maxid stored above,
    # fetched in steps of 5000 ids.
    sql_query_range         = SELECT (SELECT MIN(pid) FROM pre_forum_post),maxid FROM pre_common_sphinxcounter WHERE indexid=2
    sql_range_step          = 5000

    # Columns stored as attributes instead of full-text fields
    sql_attr_uint           = tid
    sql_attr_uint           = digest
    sql_attr_uint           = displayorder
    sql_attr_uint           = authorid
    sql_attr_uint           = special
    sql_attr_timestamp      = lastpost

    # Row lookup by document id
    sql_query_info          = SELECT * FROM pre_forum_post WHERE pid=$id
}

# posts: main index built from the pre_forum_post source
index pre_forum_post
{
    source              = pre_forum_post
    # On Windows, prefer an absolute path here.
    path                = /data1/dzbbs/indexdata/pre_forum_post
    docinfo             = extern
    mlock               = 0
    morphology          = none

    # Disabled settings (they reference an mmseg dictionary path):
    #charset_dictpath = /usr/local/mmseg32/etc/    # BSD/Linux setting; must end with "/"
    #charset_dictpath= etc/                        # Windows setting; must end with "/"
    #charset_debug   =   0
    #charset_type = zh_cn.utf-8

    # Minimum length of an indexed word
    min_word_len        = 2
    html_strip          = 0

    # Note carried over from the original config: with a plain character table
    # Sphinx splits Chinese text into single characters (character-level
    # indexing); real word segmentation needs a segmentation add-on such as
    # coreseek or sfc. Here the sfc dictionary below is used.
    charset_type        = utf-8
    chinese_dictionary  = /usr/local/sfc22/etc/xdict

    min_prefix_len      = 0
    min_infix_len       = 1
    ngram_len           = 0
}

# pre_forum_post_minute: delta source for posts, derived from pre_forum_post
source pre_forum_post_minute : pre_forum_post
{
    # Pre-queries declared for this source. Unlike the parent, the list has no
    # REPLACE INTO pre_common_sphinxcounter statement.
    # NOTE(review): presumably this keeps a delta run from moving the stored
    # maxid - confirm against Sphinx's inheritance rules for multi-value directives.
    sql_query_pre       = SET NAMES UTF8
    sql_query_pre       = SET SESSION query_cache_type=OFF

    # Range for the delta: from (stored maxid - 1) for indexid=2 up to the current MAX(pid).
    sql_query_range     = SELECT maxid-1,(SELECT MAX(pid) FROM pre_forum_post) FROM pre_common_sphinxcounter WHERE indexid=2
}

# pre_forum_post_minute: delta index for posts; derived from index pre_forum_post.
# It must read from the delta source and write to its own index files, in the
# same way index pre_forum_thread_minute does for threads. Pointing "source" at
# pre_forum_post would rebuild the full main data set on every delta run.
index pre_forum_post_minute : pre_forum_post
{
    source  = pre_forum_post_minute
    # On Windows, prefer an absolute path here.
    path    = /data0/dzbbs/indexdata/pre_forum_post_minute
}

# Global indexer settings
indexer
{
    mem_limit       = 2047M
    write_buffer    = 64M
}

# searchd service settings
searchd
{
    # Network
    listen              = 3312
    listen_backlog      = 20
    read_timeout        = 5
    max_children        = 30
    max_packet_size     = 32M

    # Query limits
    max_matches         = 500
    max_filters         = 1024
    max_filter_values   = 16384

    # Index handling
    seamless_rotate     = 0
    preopen_indexes     = 0
    unlink_old          = 1
    mva_updates_pool    = 16M
    read_buffer         = 1M

    # Runtime files. On Windows, prefer absolute paths here.
    pid_file            = /usr/local/sfc22/var/log/searchd_discuzx.pid
    log                 = /usr/local/sfc22/var/log/searchd_discuzx.log
    query_log           = /usr/local/sfc22/var/log/query_discuzx.log
}


4 测试sfc 中文分词效果。

# Run the command-line search tool with this configuration and the query 分享精彩;
# the "words:" section of its output lists the terms the query was matched as.
/usr/local/sfc22/bin/search -c /usr/local/sfc22/etc/sphinx.conf 分享精彩


words:

1. '分享': 194 documents, 266 hits

2. '精彩': 368 documents, 425 hits

3. '精彩视频': 2 documents, 2 hits

4. '精彩节目': 2 documents, 2 hits

5. '精彩绝伦': 2 documents, 2 hits

6. '精彩纷呈': 7 documents, 7 hits

7. '精彩瞬间': 5 documents, 5 hits

8. '精彩回顾': 2 documents, 3 hits

可以看出sfc 中文分词效果还是比较准确的。

5 启动sfc

# Start the searchd service using the configuration file written above.
/usr/local/sfc22/bin/searchd -c /usr/local/sfc22/etc/sphinx.conf

6 编写主索引和增量索引脚本

1)编写增量索引脚本:重建主题表和帖子表的增量索引,并把主题表的增量索引合并到主题表主索引,每20分钟运行一次。

vim /usr/local/bin/bbssearch.sh

#!/bin/bash
# desc: rebuilds the thread and post delta indexes of Sphinx-for-Chinese and
#       merges the thread delta index into the thread main index
# date: 2014.12.23
# tested in CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch.sh
# written by coralzd@gmail.com coralzd.blog.51cto.com blog.zjyxh.com
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH

LOG_FILE=/var/log/sphinx.log

# Append one "-- <message> <timestamp> --" line to the log file. The timestamp is
# read at the moment the line is written, so a run that crosses midnight still
# logs the correct date.
log() {
    echo "-- $1 $(date +%Y-%m-%d.%H:%M:%S) --" >> "$LOG_FILE"
}

# Run one indexer invocation between a start and an end log line.
#   $1      label used in the log lines
#   $2...   arguments passed to indexer
# A non-zero indexer exit status is written to the log; the script carries on
# with the remaining steps either way.
run_step() {
    local label=$1
    shift
    log "${label} start"
    indexer "$@"
    local rc=$?
    if [ "$rc" -ne 0 ]; then
        log "${label} FAILED (exit ${rc})"
    fi
    log "${label} end"
}

# Rebuild the thread delta index.
run_step "thread delta index" pre_forum_thread_minute --rotate
# Rebuild the post delta index.
run_step "post delta index" pre_forum_post_minute --rotate
# Merge the thread delta index into the thread main index.
run_step "thread main + delta merge" --merge pre_forum_thread pre_forum_thread_minute --merge-dst-range deleted 0 0 --rotate

2)编写帖子表主索引,每周运行一次。

vim /usr/local/bin/bbssearch2.sh

#!/bin/bash
# desc: rebuilds the post main index of Sphinx-for-Chinese
# date: 2014.12.23
# tested in CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch2.sh
# written by coralzd@gmail.com coralzd.blog.51cto.com blog.zjyxh.com
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH

LOG_FILE=/var/log/sphinx.log

# Append one "-- <message> <timestamp> --" line to the log file. The timestamp is
# read at the moment the line is written, so a rebuild that crosses midnight
# still logs the correct date.
log() {
    echo "-- $1 $(date +%Y-%m-%d.%H:%M:%S) --" >> "$LOG_FILE"
}

# Rebuild the post main index; a non-zero indexer exit status is written to the log.
log "post main index start"
indexer pre_forum_post --rotate
rc=$?
if [ "$rc" -ne 0 ]; then
    log "post main index FAILED (exit ${rc})"
fi
log "post main index end"


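将其放到crontab 计划任务,例如(每周任务的具体执行时间可按需调整):

*/20 * * * * /bin/bash /usr/local/bin/bbssearch.sh
0 3 * * 0 /bin/bash /usr/local/bin/bbssearch2.sh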