Some questions about the elasticsearch-ik analyzer and dynamic-synonym synonyms

Elasticsearch | by yeziblo | posted 2020-12-16 | views: 2350

I've been experimenting with analyzers recently. The effect I want: segment a term with ik_smart, then convert the result into its synonyms and their pinyin.

The ES version is 7.9.0, and the analyzer is configured as follows:
"settings": {
  "analysis": {
    "filter": {
      "synonym_filter": {
        "type": "dynamic_synonym",
        "synonyms_path": "fromDB",
        "interval": 60
      },
      "pinyin_filter": {
        "type": "pinyin",
        "keep_first_letter": true,
        "keep_separate_first_letter": false,
        "limit_first_letter_length": 50,
        "keep_full_pinyin": false,
        "keep_joined_full_pinyin": true,
        "keep_none_chinese_together": true,
        "keep_none_chinese_in_first_letter": true,
        "keep_none_chinese_in_joined_full_pinyin": true,
        "none_chinese_pinyin_tokenize": false,
        "keep_original": true,
        "lowercase": true
      }
    },
    "char_filter": {
      "symbol_filter": {
        "type": "pattern_replace",
        "pattern": "[`~!@#$^&*()=|{}:;,\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“。,、?%+_]",
        "replacement": ""
      },
      "t2s_filter": {
        "type": "stconvert",
        "keep_both": false,
        "convert_type": "t2s"
      }
    },
    "analyzer": {
      "dd_analyzer": {
        "type": "custom",
        "tokenizer": "ik_smart",
        "filter": ["synonym_filter", "pinyin_filter"],
        "char_filter": ["symbol_filter", "t2s_filter"]
      }
    }
  }
}
The analyzer has three parts. In the order ES applies them at index time:
original text -> char_filter stage (strip the listed symbols, traditional-to-simplified conversion) -> tokenizer stage (ik_smart) -> token filter stage (synonym conversion, then pinyin conversion).
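To double-check the char_filter stage in isolation, pattern_replace can be simulated with a plain regex substitution. This is only a sketch with an abridged symbol set (the full character class from symbol_filter is longer), not the plugin itself:

```python
import re

# Abridged stand-in for symbol_filter's character class: a few
# representative half-width and full-width symbols (assumption, not
# the full list from the index settings).
SYMBOLS = re.compile(r"[`~!@#$^&*()=|{}:;,\[\].<>/?,。、?!]")

def char_filter_stage(text: str) -> str:
    # pattern_replace with replacement "": strip every matched symbol
    return SYMBOLS.sub("", text)

print(char_filter_stage("马自达,三代!"))  # -> 马自达三代
```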
 
But the actual test results deviate from what I expected, and I hope someone can take a look.

I feed in the term 马自达三代, and through my custom dd_analyzer I expect to get back 昂克塞拉 together with its pinyin.

1. First, test what dd_analyzer produces before adding any extension words or synonyms:
{
  "analyzer": "dd_analyzer",
  "text": "马自达三代"
}
{
  "tokens" : [
    { "token" : "马自达", "start_offset" : 0, "end_offset" : 3, "type" : "CN_WORD", "position" : 0 },
    { "token" : "mazida", "start_offset" : 0, "end_offset" : 3, "type" : "CN_WORD", "position" : 0 },
    { "token" : "mzd", "start_offset" : 0, "end_offset" : 3, "type" : "CN_WORD", "position" : 0 },
    { "token" : "三代", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 1 },
    { "token" : "sandai", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 1 },
    { "token" : "sd", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 1 }
  ]
}
{
  "analyzer": "dd_analyzer",
  "text": "昂克塞拉"
}
{
  "tokens" : [
    { "token" : "昂", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 },
    { "token" : "ang", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 },
    { "token" : "a", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 },
    { "token" : "克", "start_offset" : 1, "end_offset" : 2, "type" : "CN_CHAR", "position" : 1 },
    { "token" : "ke", "start_offset" : 1, "end_offset" : 2, "type" : "CN_CHAR", "position" : 1 },
    { "token" : "k", "start_offset" : 1, "end_offset" : 2, "type" : "CN_CHAR", "position" : 1 },
    { "token" : "塞", "start_offset" : 2, "end_offset" : 3, "type" : "CN_CHAR", "position" : 2 },
    { "token" : "sai", "start_offset" : 2, "end_offset" : 3, "type" : "CN_CHAR", "position" : 2 },
    { "token" : "s", "start_offset" : 2, "end_offset" : 3, "type" : "CN_CHAR", "position" : 2 },
    { "token" : "拉", "start_offset" : 3, "end_offset" : 4, "type" : "CN_CHAR", "position" : 3 },
    { "token" : "la", "start_offset" : 3, "end_offset" : 4, "type" : "CN_CHAR", "position" : 3 },
    { "token" : "l", "start_offset" : 3, "end_offset" : 4, "type" : "CN_CHAR", "position" : 3 }
  ]
}

2. Add the rule 马自达三代 => 昂克塞拉 to the synonym dictionary and wait for it to finish reloading:
{
  "analyzer": "dd_analyzer",
  "text": "马自达三代"
}
{
  "tokens" : [
    { "token" : "昂", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "ang", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "a", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "克", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "ke", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "k", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "塞", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "sai", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "s", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "拉", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 },
    { "token" : "la", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 },
    { "token" : "l", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 }
  ]
}
To be honest, this is already where I get confused:
Synonym conversion lives in the token filter stage, which should run after the tokenizer. Per step 1, 马自达三代 is split by ik_smart into the two tokens 马自达 and 三代.
That means the synonym filter should receive the two tokens 马自达 and 三代, not the single term 马自达三代. So on what basis does it merge the two tokens back into one and convert them to 昂克塞拉?
Second, why does the synonym come out not as the whole word 昂克塞拉, but as the four separate tokens 昂, 克, 塞, 拉?
My guess: after the synonym filter produces the new word, that word is immediately run through dd_analyzer again.
If so, then once I add 昂克塞拉 to the ik extension dictionary, shouldn't the synonym output stop being split?
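Here is my guess modelled in toy Python (an assumption about the mechanism, not the plugin's actual code): Elasticsearch-style synonym filters tokenize both sides of a rule with the analysis chain when the rule is loaded, and matching then happens on token sequences, so a multi-word left-hand side like 马自达三代 can still match the token pair 马自达 + 三代.

```python
def ik_smart(text, dictionary):
    # Toy stand-in for ik_smart: longest dictionary match,
    # falling back to single characters.
    tokens, i = [], 0
    while i < len(text):
        for j in range(len(text), i, -1):
            if text[i:j] in dictionary or j == i + 1:
                tokens.append(text[i:j])
                i = j
                break
    return tokens

DICT = {"马自达", "三代"}  # 昂克塞拉 is NOT yet in the dictionary

# The rule "马自达三代 => 昂克塞拉" is parsed with the same tokenizer
# at load time, so both sides become token sequences:
rule_lhs = tuple(ik_smart("马自达三代", DICT))  # ('马自达', '三代')
rule_rhs = ik_smart("昂克塞拉", DICT)           # ['昂', '克', '塞', '拉']

def apply_synonym(tokens):
    # Match the cached lhs token sequence; emit the cached rhs tokens.
    out, i = [], 0
    while i < len(tokens):
        if tuple(tokens[i:i + len(rule_lhs)]) == rule_lhs:
            out.extend(rule_rhs)
            i += len(rule_lhs)
        else:
            out.append(tokens[i])
            i += 1
    return out

print(apply_synonym(ik_smart("马自达三代", DICT)))  # ['昂', '克', '塞', '拉']
```

This would explain both observations at once: the filter merges 马自达 + 三代 because the rule is keyed on that token sequence, and it emits 昂/克/塞/拉 because the right-hand side was itself tokenized when the rule was parsed.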
 
3. Add 昂克塞拉 to the extension dictionary, then call dd_analyzer to verify:
{
  "analyzer": "dd_analyzer",
  "text": "昂克塞拉"
}
{
  "tokens" : [
    { "token" : "昂克塞拉", "start_offset" : 0, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 },
    { "token" : "angkesaila", "start_offset" : 0, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 },
    { "token" : "aksl", "start_offset" : 0, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 }
  ]
}
From this result, dd_analyzer now recognizes 昂克塞拉 as a single word.
So now, when I analyze 马自达三代 with dd_analyzer, the synonym filter should first convert it to 昂克塞拉, which dd_analyzer should then keep as the whole word 昂克塞拉.
That logic should be sound, right?
 
4. Analyze 马自达三代 with dd_analyzer once more:
{
  "analyzer": "dd_analyzer",
  "text": "马自达三代"
}
{
  "tokens" : [
    { "token" : "昂", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "ang", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "a", "start_offset" : 0, "end_offset" : 3, "type" : "SYNONYM", "position" : 0 },
    { "token" : "克", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "ke", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "k", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 1 },
    { "token" : "塞", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "sai", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "s", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 2 },
    { "token" : "拉", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 },
    { "token" : "la", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 },
    { "token" : "l", "start_offset" : 3, "end_offset" : 5, "type" : "SYNONYM", "position" : 3 }
  ]
}
As shown, even after the synonym filter converts 马自达三代 to 昂克塞拉, the result is still split. I can't figure this out: at this point neither dd_analyzer nor ik_smart will split 昂克塞拉, so which mysterious analyzer is ES invoking for this split, and why?
 
I'd really appreciate it if someone could take a look... I genuinely don't understand this output.
 
PS: If I remove 马自达三代 => 昂克塞拉 from the synonym dictionary and then add it back, the output becomes completely correct... What on earth is going on? orz
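If the synonym side of a rule is tokenized once, when the rule is loaded, that would explain the PS: extending the ik dictionary afterwards doesn't touch the already-cached rule, but deleting and re-adding it forces a re-parse with the updated dictionary. A self-contained toy sketch of that assumption (not the plugin's source):

```python
def toy_tokenize(text, dictionary):
    # Longest dictionary match, falling back to single characters.
    tokens, i = [], 0
    while i < len(text):
        for j in range(len(text), i, -1):
            if text[i:j] in dictionary or j == i + 1:
                tokens.append(text[i:j])
                i = j
                break
    return tokens

old_dict = {"马自达", "三代"}          # dictionary when the rule was first loaded
new_dict = old_dict | {"昂克塞拉"}     # dictionary after the extension word was added

# What the cached rule holds vs. what a re-parsed rule would hold:
print(toy_tokenize("昂克塞拉", old_dict))  # ['昂', '克', '塞', '拉']
print(toy_tokenize("昂克塞拉", new_dict))  # ['昂克塞拉']
```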