
Pinyin tokenization fails on write when the content contains special characters such as "-"

Elasticsearch | Author: slairmy | Posted 2022-11-29 | Views: 1636

ES version: 7.7.1
pinyin plugin version: 7.7.1

Writing documents to the index fails with:

startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=0,endOffset=2,lastStartOffset=7 for field 'user_name_ik_pinyin'
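
For reference, a write along the following lines should reproduce the failure (the document body is my own minimal guess, reusing the field named in the error and the sample text analyzed further down):

PUT test_name/_doc/1
{
  "user_name_ik_pinyin": "kk-lin-test"
}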
 
The mapping:
 
"mappings" : {
"properties" : {
"avatar" : {
"type" : "keyword",
"index" : false
},
"department_id" : {
"type" : "keyword"
},
"department_name" : {
"type" : "text",
"analyzer" : "ik_max_word"
},
"id" : {
"type" : "keyword"
},
"job_number" : {
"type" : "keyword"
},
"job_position" : {
"type" : "text",
"analyzer" : "ik_max_word"
},
"user_id" : {
"type" : "keyword"
},
"user_name_ik_pinyin" : {
"type" : "text",
"analyzer" : "ik_max_word_pinyin"
},
"user_name_standard" : {
"type" : "text",
"analyzer" : "standard"
}
}
}

 
When I run the analyzer:
GET test_name/_analyze
{
  "analyzer": "ik_max_word_pinyin",
  "text": "kk-lin-test"
}
 
here is the analysis result:
 
{
  "tokens" : [
    {
      "token" : "k",
      "start_offset" : 0,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 0
    },
    {
      "token" : "lin",
      "start_offset" : 0,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 2
    },
    {
      "token" : "te",
      "start_offset" : 0,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 3
    },
    {
      "token" : "s",
      "start_offset" : 0,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 4
    },
    {
      "token" : "t",
      "start_offset" : 0,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 5
    },
    {
      "token" : "kk",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "LETTER",
      "position" : 6
    },
    {
      "token" : "lin",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "LETTER",
      "position" : 7
    },
    {
      "token" : "test",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 8
    },
    {
      "token" : "kklintest",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "LETTER",
      "position" : 8
    },
    {
      "token" : "k",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "ENGLISH",
      "position" : 9
    },
    {
      "token" : "kk",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "ENGLISH",
      "position" : 9
    },
    {
      "token" : "lin",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "ENGLISH",
      "position" : 10
    },
    {
      "token" : "te",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "ENGLISH",
      "position" : 11
    },
    {
      "token" : "s",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "ENGLISH",
      "position" : 12
    },
    {
      "token" : "t",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "ENGLISH",
      "position" : 13
    },
    {
      "token" : "test",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "ENGLISH",
      "position" : 13
    }
  ]
}


It seems to be caused by the duplicated "kk": the analyzer emits tokens with the same start_offset and end_offset but different positions, and the offsets run backwards. In the output above, "kklintest" at position 8 spans offsets 7-11, while the very next token "k" at position 9 jumps back to 0-2, which is exactly the startOffset=0, endOffset=2, lastStartOffset=7 reported in the error. The material I found online suggests commenting out part of the plugin source and recompiling (a workaround for version 6.8).
 
Is there any other way to solve this at the ES level?
 
Answer from slairmy:

After a long round of troubleshooting, it turned out one option in the settings was responsible. The original settings:
 
{
  "mappings": {
    "properties": {
      "id": {
        "type": "keyword"
      },
      "user_id": {
        "type": "keyword"
      },
      "user_name_ik_pinyin": {
        "type": "text",
        "analyzer": "ik_max_word_pinyin",
        "search_analyzer": "ik_max_word_pinyin"
      },
      "user_name_standard": {
        "type": "text",
        "analyzer": "standard",
        "search_analyzer": "standard"
      },
      "department_id": {
        "type": "keyword"
      },
      "department_name": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_max_word"
      },
      "job_number": {
        "type": "keyword"
      },
      "job_position": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_max_word"
      },
      "avatar": {
        "type": "keyword",
        "index": false
      }
    }
  },
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_smart_pinyin": {
          "type": "custom",
          "tokenizer": "ik_smart",
          "filter": [
            "my_pinyin",
            "word_delimiter"
          ]
        },
        "ik_max_word_pinyin": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": [
            "my_pinyin",
            "word_delimiter"
          ]
        },
        "standard_pinyin": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "my_pinyin",
            "word_delimiter"
          ]
        }
      },
      "filter": {
        "my_pinyin": {
          "type": "pinyin",
          "keep_full_pinyin": true,
          "remove_duplicated_term": true,
          "keep_separate_first_letter": true,
          "limit_first_letter_length": 16,
          "keep_original": true,
          "lowercase": true
        }
      }
    }
  }
}

是因为 "keep_original": true 这个配置项引起的,但是具体为什么不知道,去掉这个配置项或者改成false就好了!
