我要搜索的是酒店名称, 比如:北京希尔顿酒店、北京金茂威斯汀大饭店、北京乐多港万豪酒店。
当用户输入: 希 或 希尔 或 希尔顿 的时候, 都能搜到 北京希尔顿酒店。
那么我现在的思路就是:将 企业名称(希尔顿)配词, 并在建立索引的时候,把 北京希尔顿酒店 拆分成:
北、京、北京、希、尔、顿、希尔顿、酒、店、酒店。
分词思路:
1) 希尔顿配词
2) 先用ik_max_word 分词 (分词为)==> 北京、希尔顿、酒店
3) 使用es自带filter对 ik分的词进一步细化。 比如:ngram (但ngram并不理想)
问题: 我想问下大家, 有没有更好的思路? 或者有没有es自带的filter能达到这个效果?
参考:
{
  "analysis": {
    "filter": {
      "pinyin_filter": {
        "type": "pinyin",
        "keep_first_letter": true,
        "keep_separate_first_letter": false,
        "limit_first_letter_length": 16,
        "keep_full_pinyin": true,
        "keep_joined_full_pinyin": true,
        "keep_none_chinese": true,
        "keep_none_chinese_together": true,
        "keep_original": true,
        "lowercase": true,
        "remove_duplicated_term": true
      },
      "ngram_filter": {
        "type": "ngram"
      }
    },
    "analyzer": {
      "ik_pinyin_analyzer": {
        "type": "custom",
        "tokenizer": "ik_max_word",
        "filter": [
          "lowercase",
          "ngram_filter",
          "pinyin_filter",
          "unique"
        ]
      },
      "suggest_search_analyzer": {
        "type": "custom",
        "tokenizer": "keyword",
        "filter": [
          "lowercase"
        ]
      },
      "pinyin_analyzer": {
        "type": "custom",
        "tokenizer": "my_pinyin"
      }
    },
    "tokenizer": {
      "my_pinyin": {
        "type": "pinyin",
        "keep_separate_first_letter": false,
        "limit_first_letter_length": 50,
        "keep_full_pinyin": false,
        "keep_joined_full_pinyin": true,
        "keep_none_chinese": false,
        "keep_none_chinese_together": true,
        "keep_none_chinese_in_joined_full_pinyin": true,
        "keep_original": true,
        "lowercase": true,
        "remove_duplicated_term": true
      }
    }
  },
  "number_of_shards": 1,
  "number_of_replicas": 1
}
当用户输入: 希 或 希尔 或 希尔顿 的时候, 都能搜到 北京希尔顿酒店。
那么我现在的思路就是:将 企业名称(希尔顿)配词, 并在建立索引的时候,把 北京希尔顿酒店 拆分成:
北、京、北京、希、尔、顿、希尔顿、酒、店、酒店。
分词思路:
1) 希尔顿配词
2) 先用ik_max_word 分词 (分词为)==> 北京、希尔顿、酒店
3) 使用es自带filter对 ik分的词进一步细化。 比如:ngram (但ngram并不理想)
问题: 我想问下大家, 有没有更好的思路? 或者有没有es自带的filter能达到这个效果?
参考:
{
  "analysis": {
    "filter": {
      "pinyin_filter": {
        "type": "pinyin",
        "keep_first_letter": true,
        "keep_separate_first_letter": false,
        "limit_first_letter_length": 16,
        "keep_full_pinyin": true,
        "keep_joined_full_pinyin": true,
        "keep_none_chinese": true,
        "keep_none_chinese_together": true,
        "keep_original": true,
        "lowercase": true,
        "remove_duplicated_term": true
      },
      "ngram_filter": {
        "type": "ngram"
      }
    },
    "analyzer": {
      "ik_pinyin_analyzer": {
        "type": "custom",
        "tokenizer": "ik_max_word",
        "filter": [
          "lowercase",
          "ngram_filter",
          "pinyin_filter",
          "unique"
        ]
      },
      "suggest_search_analyzer": {
        "type": "custom",
        "tokenizer": "keyword",
        "filter": [
          "lowercase"
        ]
      },
      "pinyin_analyzer": {
        "type": "custom",
        "tokenizer": "my_pinyin"
      }
    },
    "tokenizer": {
      "my_pinyin": {
        "type": "pinyin",
        "keep_separate_first_letter": false,
        "limit_first_letter_length": 50,
        "keep_full_pinyin": false,
        "keep_joined_full_pinyin": true,
        "keep_none_chinese": false,
        "keep_none_chinese_together": true,
        "keep_none_chinese_in_joined_full_pinyin": true,
        "keep_original": true,
        "lowercase": true,
        "remove_duplicated_term": true
      }
    }
  },
  "number_of_shards": 1,
  "number_of_replicas": 1
}
1 个回复
rochy - rochy_he
赞同来自:
例如 酒店名称的mapping: