Has anyone tested the SQL feature in ES 6.3.0?

sql
I installed a single-node 6.3.0 and keep hitting the error below when running queries. Has anyone else seen this?

This is how I start the CLI:
./elasticsearch-sql-cli http://127.0.0.1:9200
sql> show tables;
name | type
----------------+---------------
hello |BASE TABLE

sql> select * from hello;
Server error [Server encountered an error [Cannot extract value [deliveraddress.address] from source]. [SqlIllegalArgumentException[Cannot extract value [deliveraddress.address] from source]
at org.elasticsearch.xpack.sql.execution.search.extractor.FieldHitExtractor.extractFromSource(FieldHitExtractor.java:139)
at org.elasticsearch.xpack.sql.execution.search.extractor.FieldHitExtractor.extract(FieldHitExtractor.java:95)
at org.elasticsearch.xpack.sql.execution.search.SearchHitRowSet.getColumn(SearchHitRowSet.java:114)
at org.elasticsearch.xpack.sql.session.AbstractRowSet.column(AbstractRowSet.java:18)
 
 
 
Here is the mapping of the test data:
{
  "test2": {
    "properties": {
      "deliveraddress": {
        "properties": {
          "phone_no":     { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "default":      { "type": "boolean" },
          "address":      { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "province":     { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "city":         { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "mapping_id":   { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "name":         { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "full_address": { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "zip_code":     { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } }
        }
      },
      "alipaywealth": {
        "properties": {
          "balance":                  { "type": "long" },
          "total_quotient":           { "type": "long" },
          "huabei_creditamount":      { "type": "long" },
          "mapping_id":               { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } },
          "huabei_totalcreditamount": { "type": "long" },
          "total_profit":             { "type": "long" }
        }
      },
      "id": { "type": "text", "fields": { "keyword": { "ignore_above": 256, "type": "keyword" } } }
    }
  }
}

Here is the test document:
{
  "_id": "5b1cbc7935eb6e0007a154bb",
  "deliveraddress": [
    {
      "phone_no": "13*******98",
      "default": true,
      "address": "江苏省无asdads市徐***镇",
      "province": "江苏",
      "city": "无锡",
      "mapping_id": "3561511087asdasd341",
      "name": "b***",
      "full_address": "湖asd***上7号",
      "zip_code": "214400"
    },
    {
      "phone_no": "15*******70",
      "default": false,
      "address": "江苏省苏州asdasdasd张家港经济技术开发区",
      "province": "江苏",
      "city": "苏州",
      "mapping_id": "3561511asdasd505341",
      "name": "a**",
      "full_address": "新asd路***德***",
      "zip_code": "215600"
    }
  ],
  "alipaywealth": {
    "balance": 0,
    "total_quotient": 0,
    "huabei_creditamount": 500,
    "mapping_id": "3561511asdsa63505341",
    "huabei_totalcreditamount": 500,
    "total_profit": 0
  }
}
 
 
---
 
My initial suspicion was that nested objects and arrays are simply not supported.

So I dug into the source code and found the snippet below.

My error is thrown in the final else branch.

Looking at it more closely: for my document, the first path element (deliveraddress) resolves to a List because the field is an array of objects, so on the second loop iteration value is neither null nor a Map and the exception is guaranteed to be thrown. In other words, any field path that passes through an array ends up in this branch, which does look like a gap in the implementation.
 
    @SuppressWarnings("unchecked")
    Object extractFromSource(Map<String, Object> map) {
        Object value = map;
        boolean first = true;
        // each node is a key inside the map
        for (String node : path) {
            if (value == null) {
                return null;
            } else if (first || value instanceof Map) {
                first = false;
                value = ((Map<String, Object>) value).get(node);
            } else {
                throw new SqlIllegalArgumentException("Cannot extract value [{}] from source", fieldName);
            }
        }
        return unwrapMultiValue(value);
    }

Some practical notes on implementing synonym search

I have been working on synonym search recently, stepped into a few pitfalls and collected some lessons. I am still fairly new to search and ES, so if anything below is wrong or incomplete, suggestions for improvement are very welcome.

Below are the notes I kept for myself:


The requirement

Synonym search is one of the must-have features of a search engine. For example, when a user searches for 广东话 (Guangdong dialect) we also want to surface content about 粤语 (Cantonese); when a user searches for 苹果手机 (Apple phone), content containing iPhone should be retrieved and shown as well.

In real life the same meaning can be expressed with many different words, and it is hard for users to cover all of them in a single query, so recognizing synonyms and searching across them clearly improves recall.

Breaking down the requirement

Before thinking about a solution, let us look at the two examples again:

  1. 苹果手机 / iPhone
  2. 广东话 / 粤语

Let us start with the first pair, 苹果手机 / iPhone.

These two terms are fully equivalent: every phone Apple has released is called an iPhone, and the name iPhone has never been used by any other company.

So when a user searches for "苹果手机购买" (buy an Apple phone), we are justified in splitting the query into "苹果手机购买" and "iPhone购买", running both, and merging the results.


In linguistics such pairs are called absolute synonyms (等义词, 等义同义词): they express exactly the same meaning, can be substituted for each other in almost any context, and the substitution does not affect the surrounding text.

There are many pairs like this, for example 猫熊 / 熊猫 (panda) and 柚子 / 文旦 (pomelo). Roughly speaking, these absolute synonyms have a few typical origins:

  1. Syllable contraction: 机枪 / 机关枪, 坦克 / 坦克车, 电扇 / 电风扇
  2. Translation versus transliteration: 出租车 / 的士, 维生素 / 维他命
  3. Regional usage, or old versus new names: 单车 / 自行车 / 脚踏车, 西红柿 / 番茄, 马铃薯 / 洋芋 / 土豆, 黄瓜 / 胡瓜
  4. Nicknames: 周杰伦 / 周董
  5. Different descriptive angles, formal term versus colloquial term: 电脑 / 计算机, 曲别针 / 回形针

These pairs are mostly nouns; there are not many of them, each group is small, and they change very little over time.


Now let us look at the second pair: 广东话 / 粤语.

Do 广东话 (Guangdong speech) and 粤语 (Cantonese) mean the same thing? Can they be substituted for each other?

Clearly not.

广东话 is a rather loose notion: it covers not only the Cantonese spoken inside Guangdong province but also Teochew, Hakka, Leizhou and other dialects. 粤语, by contrast, is a precisely defined concept with detailed rules on pronunciation and tone, and it is spoken not only across most of Guangdong but also in Guangxi, Hong Kong, Macau, and even Southeast Asia and North America. The connection between the two is that most people in Guangdong speak Cantonese.

So it would be misleading to simply equate 广东话 and 粤语, two terms that are very similar yet not identical. On reflection there are many more terms that overlap with 广东话 without matching it exactly, for example 客家话, 广州话, 广府话.

In linguistics these are called relative synonyms, or near-synonyms. They share part of their meaning and can only be substituted for each other in particular contexts.

The differences can lie in:

  1. Meaning: 毁坏 / 损坏 (the former is more severe), 介绍 / 说明 (the former can be applied to people)
  2. Connotation: 团结 / 勾结

For near-synonyms like these, offering related results at search time is a good option. For example, when a user searches for "毁坏公物如何处罚" (how is destroying public property punished), the returned results can be a merge of 90% results for "毁坏公物如何处罚" and 10% results for "损坏公物如何处罚", which brings in more recall.

These near-synonyms are mostly verbs; there are many of them and each group is large. For example, near-synonyms of 靠近 include 靠拢, 逼近, 接近, 迫近, and so on.

Approach

For the substitutable absolute synonyms, Elasticsearch's built-in synonyms feature is enough. The native synonym filter does not support hot reloading and the dictionary has to be placed in a designated directory in advance, but these absolute synonyms are few and rarely change, so this is largely a one-off task.
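As a reference point, a minimal index definition with a synonym filter might look roughly like the sketch below. The index name, filter name and word list are made up for illustration; with the standard tokenizer Chinese text is split into single characters, so in practice a Chinese analyzer (for example the IK plugin) would be plugged in here, or synonyms_path would point at a dictionary file under the config directory.

PUT /demo_index
{
  "settings": {
    "analysis": {
      "filter": {
        "my_synonyms": {
          "type": "synonym",
          "synonyms": [
            "苹果手机, iphone",
            "猫熊, 熊猫"
          ]
        }
      },
      "analyzer": {
        "my_synonym_analyzer": {
          "tokenizer": "standard",
          "filter": [ "lowercase", "my_synonyms" ]
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "properties": {
        "title": { "type": "text", "analyzer": "my_synonym_analyzer" }
      }
    }
  }
}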

For near-synonyms that cannot be substituted directly, applying the native synonyms feature as-is may bring extra recall, but precision drops sharply.

What we want instead is: whenever a term in the user's query has a near-synonym, split the query into several variants and search them jointly. As in the earlier example, when the user searches for "损坏公物如何处罚", the results can be a merge of 90% "损坏公物如何处罚" and 10% "毁坏公物如何处罚". This naturally leads to the idea of using Elasticsearch's boosting_query.

A little more thought shows that the weights for the boosting_query are not easy to obtain: how should the 90% and 10% in the example be chosen? This is the crux of joint search over near-synonyms.

Back to the example: when a user searches for "损坏公物如何处罚", our instinct that merging 90% 损坏 with 10% 毁坏 is "reasonable" really comes from a subjective belief that, when describing the fact of damaging public property, 90% of users would say 损坏 and 10% would say 毁坏.

In short, these numbers can be read as how the user population distributes its word choices when describing the same thing. Put differently, they measure how replaceable the original term and its near-synonym are within the current query.

One more example: in the query "广东话入门" (getting started with 广东话), the difference between 广东话 and 粤语 barely matters for the meaning "learning the language", so the query can be split into "广东话入门" and "粤语入门" for joint search, and the weights can even be set to 1:1 to gain more reasonable recall.

Conversely, in the query "粤语歌曲推荐" (Cantonese song recommendations) the weight for 广东话 has to be chosen carefully: there is no such expression as "广东话歌曲" in the first place, and in a broad corpus 歌曲 and 广东话 are rarely mentioned together. So even if "广东话歌曲推荐" appears among the expansions of "粤语歌曲推荐", its weight needs to be set very low (it only serves as a fallback if there really is no "粤语歌曲" content).

By now the direction should be clear: a language model can be used here, and its perplexity maps directly onto the weights discussed above.

So the overall flow can be: segment the user's query; find all possible synonym substitutions among the tokens; feed each substituted query into the language model to obtain its perplexity (or another metric); and use those scores as the weights in the boosting query.

graph TD;
广东话入门-->'广东话','入门';
'广东话','入门'-->_'广东话','入门'_;
'广东话','入门'-->_'粤语','入门'_;
_'广东话','入门'_-->语言模型;
_'粤语','入门'_-->语言模型;
语言模型-->PPL:0.65;
语言模型-->PPL:0.35;

As for the language model itself, you can train a classic n-gram model or something like a bidirectional LSTM on your own corpus, or use an off-the-shelf service such as http://ai.baidu.com/tech/nlp/dnnlm_cn
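To make the last step concrete, here is a rough sketch of how the two weighted variants from the flow above (weights 0.65 / 0.35) could be combined in a single request. The index and field names are invented, and a bool/should query with a per-clause boost is used as one possible realisation of the weighted merge; it is not the only way to express this.

GET /demo_index/_search
{
  "query": {
    "bool": {
      "should": [
        { "match": { "title": { "query": "广东话 入门", "boost": 0.65 } } },
        { "match": { "title": { "query": "粤语 入门", "boost": 0.35 } } }
      ]
    }
  }
}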


An aggregation OOM bug in ES 5.3

Given the following DSL:

{
  "size" : 0,
  "query" : { },
  "_source" : false,
  "aggregations" : {
    "aggData" : {
      "terms" : {
        "field" : "url",
        "size" : 200,
        "min_doc_count" : 1,
        "shard_min_doc_count" : 0,
        "show_term_doc_count_error" : false,
        "order" : [
          {
            "PV" : "desc"
          }
        ]
      },
      "aggregations" : {
        "PV" : {
          "cardinality" : {
            "field" : "userssid"
          }
        }
      }
    }
  }
}

The goal is to group the URLs that users visit and sort the groups by the number of unique users. When this ran, the data nodes went into frequent full GC, memory could not be reclaimed, the nodes soon OOMed and dropped out of the cluster, and the cluster went red.

At first I assumed the cardinality precision was causing the memory use, so I set precision_threshold to 100 and ran it again. Memory usage did drop a lot, but it was still at the GB level. To confirm whether cardinality really was the problem, I removed the outer aggregation and ran only this:

"aggregations" : {
        "PV" : {
          "cardinality" : {
            "field" : "userssid"
          }
        }
      }

The response was very fast and memory usage was only at the KB level. Running the outer terms aggregation on its own was also fast, so I suspected the order clause. Removing order made everything run smoothly, with no more OOMs.

To protect against this kind of OOM, the first thing that came to mind was the circuit breaker. The default indices.breaker.request.limit is 60%. After changing it to 10%, the breaker tripped and the cluster stayed healthy, but after a few more runs the data nodes still OOMed. Debugging step by step showed that memory grew a little with every execution and was not released after the breaker tripped, until the node finally OOMed. So it was fairly clear that the order clause was leaking memory.

Around that time a colleague reported that 5.6 did not have this problem, so I checked the release notes and indeed found the fix in 5.5 (see the issue description). The root cause of the bug is:

terms aggregations at the root level use the global_ordinals execution hint by default.
When all sub-aggregators can be run in breadth_first mode the collected buckets for these sub-aggs are dense (remapped after the initial pruning).
But if a sub-aggregator is not deferrable and needs to collect all buckets before pruning we don't remap global ords and the aggregator needs to deal with sparse buckets.
Most (if not all) aggregators expect dense buckets and uses this information to allocate memories.
This change forces the remap of the global ordinals but only when there is at least one sub-aggregator that cannot be deferred.
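For reference, the circuit-breaker experiment mentioned above can be reproduced with a transient cluster settings update along these lines (10% is just the value used while debugging; the default is 60%):

PUT /_cluster/settings
{
  "transient": {
    "indices.breaker.request.limit": "10%"
  }
}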

Solutions: 1. Upgrade to 5.5 or later;

2. Add the "execution_hint": "map" property to the terms aggregation in the DSL:

{
  "size" : 0,
  "query" : { },
  "_source" : false,
  "aggregations" : {
    "aggData" : {
      "terms" : {
        "field" : "url",
        "size" : 200,
"execution_hint":"map",
        "min_doc_count" : 1,
        "shard_min_doc_count" : 0,
        "show_term_doc_count_error" : false,
        "order" : [
          {
            "PV" : "desc"
          }
        ]
      },
      "aggregations" : {
        "PV" : {
          "cardinality" : {
            "field" : "userssid"
          }
        }
      }
    }
  }
}

Highlight returns an incomplete title (see the screenshot)


360截图20180609164613963.jpg

$params = [
    'index' => 'soso_*',
    'type' => 'links_1',
    'body' => [
        'query' => [
            'match' => [
                'title' => $kw
            ]
        ],
        'highlight' => [
            'pre_tags' => '<em>',
            'post_tags' => '</em>',
            'fields' => [
                'title' => new \stdClass()
            ]
        ],
    ]
];

Elastic Daily, Issue 289 (2018-06-01)

1. How to use Elasticsearch snapshot backups
https://elasticsearch.cn/article/648
2. Building a real-time search system for tens of millions of records with ElasticSearch + Canal
http://t.cn/R8vjBwD
3. [Offline event] Agenda for the Nanjing Elastic Meetup on 2018-06-30
https://elasticsearch.cn/article/647

Editor: 铭毅天下
Archive: https://elasticsearch.cn/article/649
Subscribe: https://tinyletter.com/elastic-daily
 

How to use Elasticsearch snapshot backups

Most databases provide a backup mechanism, so that when the database becomes unusable you can bring up a new instance and restore from backup to limit the damage. Elasticsearch is quite resilient, but it still needs backups, for the following reasons:

  1. Disaster recovery. When the whole cluster stops working, data can be restored from a backup in time.
  2. Archiving. As data accumulates, log data for example, the storage pressure on the cluster keeps growing, on both memory and disk. A common choice is to keep only a recent window, say one month, and delete anything older. If you do not want to throw that data away, in case it is needed later, you can archive it in the form of backups.
  3. Migration. Backups can also be used to move data from one cluster to another.

There are two ways to back up Elasticsearch. One is to export the data as text files, for example with tools such as elasticdump or esm. The other is to snapshot the files in the Elasticsearch data directory, which is what the snapshot API implements. The first approach is simple and practical for small data volumes, but its efficiency falls off sharply at scale. This post focuses on the second approach, that is, how to use the snapshot API.

A backup scheme has to answer four questions: where to back up, how to back up, when to back up, and how to restore. Let us take them one by one.

1. Where to back up

In Elasticsearch the storage type and location of backups are defined through a repository. Supported storage types include shared file systems, AWS S3, HDFS, Microsoft Azure storage, Google Cloud Storage and others, and you can also write your own plugin, for example for a domestic cloud such as Alibaba Cloud. Here we use the simplest option, a shared file system, which you can also try out locally.

First, declare the allowed backup locations in elasticsearch.yml via path.repo, as shown below:

path.repo: ["/mount/backups", "/mount/longterm_backups"]

With that configured, you can use the snapshot API to create a repository. Below we create one named my_backup:

PUT /_snapshot/my_backup
{
  "type": "fs",
  "settings": {
    "location": "/mount/backups/my_backup"
  }
}

From now on we can back data up into this repository.

2. How to back up

With a repository in place we can take a backup, also called a snapshot, which records the state of the data at that moment. Below we create a snapshot named snapshot_1:

PUT /_snapshot/my_backup/snapshot_1?wait_for_completion=true

Setting wait_for_completion to true makes the API return only after the backup has finished; by default it runs asynchronously. We set it here only to see the result immediately; in production you should leave it out and let the snapshot run in the background.

On success it returns a result like the following, describing the backup:

{
  "snapshots": [
    {
      "snapshot": "snapshot_1",
      "uuid": "52Lr4aFuQYGjMEv5ZFeFEg",
      "version_id": 6030099,
      "version": "6.3.0",
      "indices": [
        ".monitoring-kibana-6-2018.05.30",
        ".monitoring-es-6-2018.05.28",
        ".watcher-history-7-2018.05.30",
        ".monitoring-beats-6-2018.05.29",
        "metricbeat-6.2.4-2018.05.28",
        ".monitoring-alerts-6",
        "metricbeat-6.2.4-2018.05.30"
      ],
      "include_global_state": true,
      "state": "SUCCESS",
      "start_time": "2018-05-31T12:45:57.492Z",
      "start_time_in_millis": 1527770757492,
      "end_time": "2018-05-31T12:46:15.214Z",
      "end_time_in_millis": 1527770775214,
      "duration_in_millis": 17722,
      "failures": [],
      "shards": {
        "total": 28,
        "failed": 0,
        "successful": 28
      }
    }
  ]
}

The fields in the response are fairly self-explanatory: indices lists the indices included in this backup (since we did not specify any, all indices were backed up), state shows the status, duration_in_millis shows how long the backup took, and so on.

You can check the state of snapshot_1 at any time with GET _snapshot/my_backup/snapshot_1.

If you now look into /mount/backups/my_backup you will find many new files; they are compressed backup files generated from the files in the Elasticsearch data directory. Run du -sh . on that directory and note its size for later comparison.

3. When to back up

The steps above gave us one backup, but as new data arrives it needs to be backed up as well. How? Simply create another snapshot, snapshot_2:

PUT /_snapshot/my_backup/snapshot_2?wait_for_completion=true

When it completes you will see that /mount/backups/my_backup has grown, which means the new data has been backed up. Note that when you take multiple snapshots in the same repository, Elasticsearch checks which segment files of the data to be backed up have changed: unchanged segments are skipped and only the changed segment files are copied. In other words, snapshots are incremental.

Experienced Elasticsearch users will know the force merge feature, which forcibly merges an index's segment files down to a specified number. Be aware that if you call the force merge API yourself, the incremental nature of snapshots is lost, because after the call all segment files in the data directory have changed.

The other question is timing. Although snapshots do not consume much CPU, disk or network, it is still best to run them during off-peak hours.
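If backups are taken on a schedule, naming snapshots by date can be convenient. As a sketch, assuming the same my_backup repository: date math in the snapshot name lets an external scheduler (cron or similar) issue the identical request every day, producing names like snapshot-2018.06.01. The angle brackets of the date-math expression have to be URL-encoded when the request is sent over HTTP:

PUT /_snapshot/my_backup/%3Csnapshot-%7Bnow%2Fd%7D%3E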

4. How to restore

Backups exist to be used one day, so let us rehearse restoring one. The following API call restores the data quickly:

POST /_snapshot/my_backup/snapshot_1/_restore?wait_for_completion=true
{
  "indices": "index_1",
  "rename_pattern": "index_(.+)",
  "rename_replacement": "restored_index_$1"
}

With the call above, the index index_1 is restored into restored_index_1. The restore is entirely file based, so it is quite efficient.

Although we demonstrated backup and restore within the same cluster, you can also attach the same repository to another cluster and restore there; we will not go into that here.

5. Other notes

Because Elasticsearch versions move quickly, pay attention to version compatibility when backing up and restoring. Within the same major version there is no problem, for example backups can be exchanged freely between 5.1 and 5.6. You cannot restore a backup from a newer major version into an older one, for example a 6.x backup into 5.x. Restoring an older backup into a newer version works within limits:

1) 5.x can be restored into 6.x

2) 2.x can be restored into 5.x

3) 1.x can be restored into 2.x

Skipping more than one major version is not supported, for example a 1.x backup cannot be restored into 5.x. The main reason lies with Lucene: every Elasticsearch major version comes with a new Lucene major version, and Lucene only guarantees being able to read files written by the previous major version, so compatibility across too many versions is simply not possible.

6. Further reading

This post is only a quick demonstration of the snapshot feature; hopefully it is enough to spark your interest. If you want to go deeper, for example how to back up only certain indices, how to track backup and restore progress, how to restore across clusters, or how to back up to HDFS, read the official manual: https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-snapshots.html. If you run into problems along the way, feel free to leave a comment and discuss.


Reposting a great article on how many shards to have

How many shards should I have in my Elasticsearch cluster?
 
Elasticsearch is a very versatile platform, that supports a variety of use cases, and provides great flexibility around data organisation and replication strategies. This flexibility can however sometimes make it hard to determine up-front how to best organize your data into indices and shards, especially if you are new to the Elastic Stack. While suboptimal choices  will not necessarily cause problems when first starting out, they have the potential to cause performance problems as data volumes grow over time. The more data the cluster holds, the more difficult it also becomes to correct the problem, as reindexing of large amounts of data can sometimes be required.

When we come across users that are experiencing performance problems, it is not uncommon that this can be traced back to issues around how data is indexed and number of shards in the cluster. This is especially true for use-cases involving multi-tenancy and/or use of time-based indices. When discussing this with users, either in person at events or meetings or via our forum, some of the most common questions are “How many shards should I have?” and “How large should my shards be?”.

This blog post aims to help you answer these questions and provide practical guidelines for use cases that involve the use of time-based indices, e.g. logging or security analytics, in a single place.

What is a shard?

Before we start, we need to establish some facts and terminology that we will need in later sections.

Data in Elasticsearch is organized into indices. Each index is made up of one or more shards. Each shard is an instance of a Lucene index, which you can think of as a self-contained search engine that indexes and handles queries for a subset of the data in an Elasticsearch cluster.

As data is written to a shard, it is periodically published into new immutable Lucene segments on disk, and it is at this time it becomes available for querying. This is referred to as a refresh. How this works is described in greater detail in Elasticsearch: the Definitive Guide.

As the number of segments grow, these are periodically consolidated into larger segments. This process is referred to as merging. As all segments are immutable, this means that the disk space used will typically fluctuate during indexing, as new, merged segments need to be created before the ones they replace can be deleted. Merging can be quite resource intensive, especially with respect to disk I/O.

The shard is the unit at which Elasticsearch distributes data around the cluster. The speed at which Elasticsearch can move shards around when rebalancing data, e.g. following a failure, will depend on the size and number of shards as well as network and disk performance.

TIP: Avoid having very large shards as this can negatively affect the cluster's ability to recover from failure. There is no fixed limit on how large shards can be, but a shard size of 50GB is often quoted as a limit that has been seen to work for a variety of use-cases.

Index by retention period

As segments are immutable, updating a document requires Elasticsearch to first find the existing document, then mark it as deleted and add the updated version. Deleting a document also requires the document to be found and marked as deleted. For this reason, deleted documents will continue to tie up disk space and some system resources until they are merged out, which can consume a lot of system resources.

Elasticsearch allows complete indices to be deleted very efficiently directly from the file system, without explicitly having to delete all records individually. This is by far the most efficient way to delete data from Elasticsearch.

TIP: Try to use time-based indices for managing data retention whenever possible. Group data into indices based on the retention period. Time-based indices also make it easy to vary the number of primary shards and replicas over time, as this can be changed for the next index to be generated. This simplifies adapting to changing data volumes and requirements.

Are indices and shards not free?

For each Elasticsearch index, information about mappings and state is stored in the cluster state. This is kept in memory for fast access. Having a large number of indices in a cluster can therefore result in a large cluster state, especially if mappings are large. This can become slow to update as all updates need to be done through a single thread in order to guarantee consistency before the changes are distributed across the cluster.

TIP: In order to reduce the number of indices and avoid large and sprawling mappings, consider storing data with similar structure in the same index rather than splitting into separate indices based on where the data comes from. It is important to find a good balance between the number of indices and the mapping size for each individual index.

Each shard has data that need to be kept in memory and use heap space. This includes data structures holding information at the shard level, but also at the segment level in order to define where data reside on disk. The size of these data structures is not fixed and will vary depending on the use-case.

One important characteristic of the segment related overhead is however that it is not strictly proportional to the size of the segment. This means that larger segments have less overhead per data volume compared to smaller segments. The difference can be substantial.

In order to be able to store as much data as possible per node, it becomes important to manage heap usage and reduce the amount of overhead as much as possible. The more heap space a node has, the more data and shards it can handle.

Indices and shards are therefore not free from a cluster perspective, as there is some level of resource overhead for each index and shard.

TIP: Small shards result in small segments, which increases overhead. Aim to keep the average shard size between a few GB and a few tens of GB. For use-cases with time-based data, it is common to see shards between 20GB and 40GB in size.

TIP: As the overhead per shard depends on the segment count and size, forcing smaller segments to merge into larger ones through a forcemerge operation can reduce overhead and improve query performance. This should ideally be done once no more data is written to the index. Be aware that this is an expensive operation that should ideally be performed during off-peak hours.
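As a reference, a force merge of an index that is no longer being written to can be triggered roughly like this (the index name is illustrative):

POST /logs-2018.05/_forcemerge?max_num_segments=1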

TIP: The number of shards you can hold on a node will be proportional to the amount of heap you have available, but there is no fixed limit enforced by Elasticsearch. A good rule-of-thumb is to ensure you keep the number of shards per node below 20 to 25 per GB heap it has configured. A node with a 30GB heap should therefore have a maximum of 600-750 shards, but the further below this limit you can keep it the better. This will generally help the cluster stay in good health.
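A quick way to check how many shards each node currently holds, along with its disk usage, is the cat APIs, for example (output columns may vary by version):

GET /_cat/allocation?v
GET /_cat/shards?v&s=store:desc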

How does shard size affect performance?

In Elasticsearch, each query is executed in a single thread per shard. Multiple shards can however be processed in parallel, as can multiple queries and aggregations against the same shard.

This means that the minimum query latency, when no caching is involved, will depend on the data, the type of query, as well as the size of the shard. Querying lots of small shards will make the processing per shard faster, but as many more tasks need to be queued up and processed in sequence, it is not necessarily going to be faster than querying a smaller number of larger shards. Having lots of small shards can also reduce the query throughput if there are multiple concurrent queries.

TIP: The best way to determine the maximum shard size from a query performance perspective is to benchmark using realistic data and queries. Always benchmark with a query and indexing load representative of what the node would need to handle in production, as optimizing for a single query might give misleading results.

How do I manage shard size?

When using time-based indices, each index has traditionally been associated with a fixed time period. Daily indices are very common, and often used for holding data with short retention period or large daily volumes. These allow retention period to be managed with good granularity and make it easy to adjust for changing volumes on a daily basis. Data with a longer retention period, especially if the daily volumes do not warrant the use of daily indices, often use weekly or monthly indices in order to keep the shard size up. This reduces the number of indices and shards that need to be stored in the cluster over time.

TIP: If using time-based indices covering a fixed period, adjust the period each index covers based on the retention period and expected data volumes in order to reach the target shard size.

Time-based indices with a fixed time interval works well when data volumes are reasonably predictable and change slowly. If the indexing rate can vary quickly, it is very difficult to maintain a uniform target shard size.

In order to be able to better handle this type of scenarios, the Rollover and Shrink APIs were introduced. These add a lot of flexibility to how indices and shards are managed, specifically for time-based indices.

The rollover index API makes it possible to specify the number of documents an index should contain and/or the maximum period documents should be written to it. Once one of these criteria has been exceeded, Elasticsearch can trigger a new index to be created for writing without downtime. Instead of having each index cover a specific time-period, it is now possible to switch to a new index at a specific size, which makes it possible to more easily achieve an even shard size for all indices.

In cases where data might be updated, there is no longer a distinct link between the timestamp of the event and the index it resides in when using this API, which may make updates significantly less efficient as each update may need to be preceded by a search.

TIP: If you have time-based, immutable data where volumes can vary significantly over time, consider using the rollover index API to achieve an optimal target shard size by dynamically varying the time-period each index covers. This gives great flexibility and can help avoid having too large or too small shards when volumes are unpredictable.
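As a sketch of the rollover API described above (the alias name and threshold values are illustrative): with a write alias logs_write pointing at the current index, the call below rolls over to a new index as soon as either condition is met.

POST /logs_write/_rollover
{
  "conditions": {
    "max_age": "7d",
    "max_docs": 50000000
  }
}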


The shrink index API allows you to shrink an existing index into a new index with fewer primary shards. If an even spread of shards across nodes is desired during indexing, but this will result in too small shards, this API can be used to reduce the number of primary shards once the index is no longer indexed into. This will result in larger shards, better suited for longer term storage of data.


TIP: If you need to have each index cover a specific time period but still want to be able to spread indexing out across a large number of nodes, consider using the shrink API to reduce the number of primary shards once the index is no longer indexed into. This API can also be used to reduce the number of shards in case you have initially configured too many shards.
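And a sketch of the shrink API (index names, node name and shard counts are illustrative): the source index is first made read-only and all of its shards are relocated onto one node, then it is shrunk into a new index with fewer primary shards.

PUT /logs-2018.05/_settings
{
  "settings": {
    "index.routing.allocation.require._name": "node-1",
    "index.blocks.write": true
  }
}

POST /logs-2018.05/_shrink/logs-2018.05-shrunk
{
  "settings": {
    "index.number_of_shards": 1,
    "index.number_of_replicas": 1
  }
}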


Conclusions

This blog post has provided tips and practical guidelines around how to best manage data in Elasticsearch. If you are interested in learning more, "Elasticsearch: the definitive guide" contains a section about designing for scale, which is well worth reading even though it is a bit old.

A lot of the decisions around how to best distribute your data across indices and shards will however depend on the use-case specifics, and it can sometimes be hard to determine how to best apply the advice available. For more in-depth and personal advice you can engage with us commercially through a subscription and let our Support and Consulting teams help accelerate your project. If you are happy to discuss your use-case in the open, you can also get help from our community and through our public forum.

[Tech discussion] Elasticsearch on the rise: comparing several open-source data engines

The 2018 database engine rankings are out; have you seen them?
排名.png

Elasticsearch has pushed its way into the top ten and is a rising star.

People regularly face the question of which data engine to choose. Below we compare a few popular open-source engines for reference.
对比.png


From the table above:
MySQL: stores data in tables and is queried with SQL; it is currently the most popular relational database management system. If you need a traditional database, MySQL is a solid choice.
MongoDB: a NoSQL database based on distributed file storage, with documents made of key-value pairs and good scalability. If you need to store and query unstructured data and have little need for analytics or full-text search, MongoDB is a good fit.
Redis: a high-performance key-value store with a degree of transaction support. It is mainly used as a cache and is a good choice when reads and writes must be extremely fast, but because it relies entirely on memory, large data volumes mean correspondingly demanding hardware requirements.
Elasticsearch: a distributed search engine built on Lucene, offering full-text search, synonym handling, relevance ranking and complex data analysis. If you need text search with relevance ranking, or metrics-style analysis, Elasticsearch is an excellent fit; it has a natural advantage in full-text document search (web and app search) and log analysis (business operations and IT operations).

This is just meant to start the conversation; feel free to comment and discuss which scenarios suit which engines.

If you would like to try Elasticsearch quickly, check out the link below: Huawei Cloud Search Service is Elasticsearch in the cloud, easy to use, maintenance-free, elastic and reliable.
 

Digest | Key notes from Elasticsearch evangelist Medcl's conversation with Ctrip's Wood

Episode 2 of the Elastic Podcast is out. This time we visited Ctrip in Shanghai. Ctrip uses Elasticsearch heavily, both for centralized operations log management and as a unified search platform for its business units. They currently run as many as 94 Elasticsearch clusters with more than 700 nodes in production, ingest 160 billion new log entries per day with peaks of 3 million per second, store 2.5 trillion indexed documents in Elasticsearch, and use PB-level disk storage. To find out how Ctrip copes with data at this scale, and what their best practices are, listen to this episode of the podcast with Ctrip's two technical leads, Wu Xiaogang and Hu Hang.

Audio: http://m.ximalaya.com/111156131/sound/89571047

Host: Elastic developer evangelist Zeng Yong (Medcl). Guests: 1. Wu Xiaogang (Wood), director of system R&D in Ctrip's technical assurance department, an early Elasticsearch practitioner in China and an active member of the Chinese community. He previously worked on system software development, system integration and technical support at eBay, Morgan Stanley and PPTV, has a strong interest in operations automation, visualization and performance optimization for large-scale IT systems, and likes to understand not only how things work but why.

2. Hu Hang, senior technical manager at Ctrip, responsible for search implementations and SOA service development. He previously worked at Tencent and Shanda, keeps a strong curiosity about new technology, and currently focuses on implementing business features on Elasticsearch and on JVM performance tuning.

1. History of Elasticsearch at Ctrip

1.1 Operations team (Wood):

Started in 2014 on ES 0.9. They compared it with MongoDB, which hit performance bottlenecks once the data volume grew. After the evaluation they chose ELK (Elasticsearch, Logstash, Kibana). Result: real-time visibility, queries and aggregations.

1.2 Hu Hang's business team:

Use case: hotel prices. Reasons for choosing ES: distributed, debuggable, good performance. Version: started on ES 2.3; from mid-2017 they gradually moved to ES 5.3. The effect was significant: the platform team can focus on backend development while the business teams build their own features.

2. Scale of Elasticsearch at Ctrip

2.1 Operations team (Wood):

Clusters: 94, ranging from 3 nodes to 360+ nodes. Nodes: 700+. Daily ingest: 160 billion documents. Peak: 3 million/s. Total data: 2.5 trillion documents, at PB scale. Main challenges: 1) real-time ingestion; 2) business workflows that need several months to two years of history.

2.2 Hu Hang's business team:

Setup: 3 clusters, 6 nodes each. Largest single index: 10 to 20 million documents.

Focus: the ES foundation, helping business teams with indexing, querying and DSL tuning. Query load: 3000 to 4000 requests/s.

Ctrip's ES deployment is among the largest in China, which brings big challenges.

3. Pitfalls Ctrip has been through

3.1 Operations team (Wood):

3.1.1 Pain point 1: out-of-memory errors.

Cause: early versions put few limits on queries, and once data reached scale, queries and aggregations consumed a lot of memory.

After upgrading, ES added many safeguards and cluster-level limits, and things became more and more stable.

3.1.2 Pain point 2: clusters that could not recover after failures.

3.1.3 Pain point 3: translog processing never finishing.

3.1.4 Pain point 4: managing all the clusters as a platform.

What it takes: studying the underlying mechanics and finding ways to avoid the problems. With experience, operational efficiency goes up.

3.2 Hu Hang's business team:

3.2.1 Pain point 1: problems caused by weak familiarity with ES fundamentals;

3.2.2 Pain point 2: a performance problem that was eventually traced to the keyword type in ES 5.x.

4. Architecture

4.1 Operations team (Wood):

1. Early days: ELK + Redis (as an intermediate buffer).

Challenges: 1) Redis could not absorb the load.

2) Redis is single-threaded.

Improvements: 1) Replacing Redis with Kafka (disk-backed) unblocked the data flow.

2) Logstash consumed a lot of memory, so they moved to logstash forwarder; the official Beats are the recommendation now.

3) At scale, Logstash's memory consumption would have required many servers.

Optimization: they wrote gohangout in Go (https://github.com/childe/gohangout),

whose memory footprint is much lower than the Java-based hangout (https://github.com/childe/hangout).

4.2 Hu Hang's search team:

1) A single cluster became a pressure bottleneck.

Change: business data is imported into ES and a customized client is provided.

2) The search platform keeps taking on more business requirements.

Because custom development on top of Kibana is inconvenient, they built a simple website for data queries and monitoring that fits the business needs more closely.

5. What is new in ES 6.3 (a preview)

5.1 ES 6.3 adds a SQL interface

Wood: today people look at the DSL in Kibana, copy it and modify it, which is awkward for users who are not yet familiar with it.

The BI side also wants SQL-like queries.

Benefit: simpler, and therefore more broadly useful.

Ctrip's BI department's use cases: search keywords, hot-term statistics, destinations and similar data.

When Kibana cannot express a requirement, code has to be written; with SQL such things can be developed very quickly.

Hu Hang's search team: writing DSL is still somewhat complex; they currently rely on the NLPChina elasticsearch-sql plugin.

In practice the plugin still has issues, so they look forward to official SQL support from ES.

5.2 Richer visualizations in Kibana

5.3 Faster indexing

Refresh optimizations improve throughput.

6. Favourite features of the ELK Stack

Wood: the rich scalability. Users do not have to care about the underlying implementation, and adding nodes to the cluster makes querying large data volumes easy.

Hu Hang: ES's visibility and debuggability. For example: 1) when something goes wrong you can check whether the DSL is appropriate and whether the mapping is appropriate;

2) trusting the ES community means not having to worry about the internals, leaving more time for business features;

3) with a good data model in ES, business requirements are straightforward to implement.

7. Where the ELK Stack most needs to improve

Wood: 1) Cluster protection needs further work. How should data loss be handled?

How should destroyed nodes be handled?

Goal: lighten the operations burden.

2) Identifying bad queries; the slow log has shortcomings: it is hard to tell which slow query actually caused a failure.

When a cluster is failing, an API for real-time analysis would be better than just reading the slow log.

Hu Hang: 1) ES still has plenty of pitfalls, which cost a lot of time.

2) He hopes the community will curate the common problems.

3) He hopes the official side will put together a complete guide, something like a cookbook,

that beginners can follow and learn from (most people lack experience).

8. Advice for beginners

1) Required reading: "Elasticsearch: the Definitive Guide" (English and Chinese). Wood has read it at least three times.

2) Keep practicing.

3) Go back to the documentation with concrete questions and build up your own knowledge system.

4) Participate in the community, try to understand and solve other people's problems, and keep learning to improve yourself. Help each other and grow together.

5) A small suggestion for the Chinese community: collect the best questions and answers into a digest that newcomers can read through to learn from others' experience.

9. How do you see Elasticsearch developing in China?

1) Participation and contribution from China are still insufficient;

2) Chinese analysis plugins and similar areas: higher-quality segmentation, domain-specific semantic search (better relevance), sentiment annotation, NLP and so on.

3) Richer applications in Chinese-language scenarios.

4) Community discussions are rather scattered; the community needs opinion leaders to drive deeper discussion in specific areas.

5) Medcl: ElasticTips has been launched; everyone is encouraged to take part and share tips in image form.

6) The community will keep doing more; please share and exchange ideas with each other.

10. Summary

It is quite striking that Wood has read "Elasticsearch: the Definitive Guide" three times; we have no excuse not to work hard.

Keep at it!


Elasticsearch如何实现 SQL语句中 Group By 和 Limit 的功能

有 SQL 背景的同学在学习 Elasticsearch 时,面对一个查询需求,不由自主地会先思考如何用 SQL 来实现,然后再去想 Elasticsearch 的 Query DSL 如何实现。那么本篇就给大家讲一条常见的 SQL 语句如何用 Elasticsearch 的查询语言实现。

1. SQL语句

假设我们有一个汽车的数据集,每辆汽车都有车型、颜色等字段,我希望获取颜色种类大于1的车型中、按颜色种类数倒序排列的前2个车型。假设汽车的数据模型如下:

{
    "model":"modelA",
    "color":"red"
}

假设我们有一个 cars 表,通过如下语句创建测试数据。

INSERT INTO cars (model,color) VALUES ('A','red'); 
INSERT INTO cars (model,color) VALUES ('A','white'); 
INSERT INTO cars (model,color) VALUES ('A','black'); 
INSERT INTO cars (model,color) VALUES ('A','yellow'); 
INSERT INTO cars (model,color) VALUES ('B','red'); 
INSERT INTO cars (model,color) VALUES ('B','white'); 
INSERT INTO cars (model,color) VALUES ('C','black'); 
INSERT INTO cars (model,color) VALUES ('C','red'); 
INSERT INTO cars (model,color) VALUES ('C','white'); 
INSERT INTO cars (model,color) VALUES ('C','yellow'); 
INSERT INTO cars (model,color) VALUES ('C','blue'); 
INSERT INTO cars (model,color) VALUES ('D','red');
INSERT INTO cars (model,color) VALUES ('A','red'); 

那么实现我们需求的 SQL 语句也比较简单,实现如下:

SELECT model,COUNT(DISTINCT color) color_count FROM cars GROUP BY model HAVING color_count > 1 ORDER BY color_count desc LIMIT 2;

这条查询语句中 Group By 是按照 model 做分组,Having color_count>1 限定了车型的颜色种类大于1,ORDER BY color_count desc 限定结果按照颜色种类数倒序排列,而 LIMIT 2 限定只返回前2条数据。

那么在 Elasticsearch 中如何实现这个需求呢?

2. 在 Elasticsearch 模拟测试数据

首先我们需要在 Elasticsearch 中插入测试数据,这里我们使用 bulk 接口,如下所示:

POST _bulk
{"index":{"_index":"cars","_type":"doc","_id":"1"}}
{"model":"A","color":"red"}
{"index":{"_index":"cars","_type":"doc","_id":"2"}}
{"model":"A","color":"white"}
{"index":{"_index":"cars","_type":"doc","_id":"3"}}
{"model":"A","color":"black"}
{"index":{"_index":"cars","_type":"doc","_id":"4"}}
{"model":"A","color":"yellow"}
{"index":{"_index":"cars","_type":"doc","_id":"5"}}
{"model":"B","color":"red"}
{"index":{"_index":"cars","_type":"doc","_id":"6"}}
{"model":"B","color":"white"}
{"index":{"_index":"cars","_type":"doc","_id":"7"}}
{"model":"C","color":"black"}
{"index":{"_index":"cars","_type":"doc","_id":"8"}}
{"model":"C","color":"red"}
{"index":{"_index":"cars","_type":"doc","_id":"9"}}
{"model":"C","color":"white"}
{"index":{"_index":"cars","_type":"doc","_id":"10"}}
{"model":"C","color":"yellow"}
{"index":{"_index":"cars","_type":"doc","_id":"11"}}
{"model":"C","color":"blue"}
{"index":{"_index":"cars","_type":"doc","_id":"12"}}
{"model":"D","color":"red"}
{"index":{"_index":"cars","_type":"doc","_id":"13"}}
{"model":"A","color":"red"}

其中 index 为 cars,type 为 doc,所有数据与上面 SQL 语句插入的数据保持一致。大家可以在 Kibana 的 Dev Tools 中执行上面的命令,然后执行下面的查询语句验证数据是否已经成功存入。

GET cars/_search
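除了直接 _search,也可以顺手确认一下文档条数和自动生成的 mapping(这里依赖的是默认的 dynamic mapping:字符串字段会被映射成 text 并附带 keyword 子字段,后面聚合用到的 model.keyword、color.keyword 正是由此而来):

GET cars/_count

GET cars/_mapping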

3. Group By VS Terms/Metric Aggregation

SQL 中 Group By 语句在 Elasticsearch 中对应的是 Terms Aggregation,即分桶聚合,对应 Group By model 的语句如下所示:

GET cars/_search
{
  "size":0,
  "aggs":{
    "models":{
      "terms":{
        "field":"model.keyword"
      }
    }
  }
}

结果如下:

{
  "took": 161,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 13,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "models": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "A",
          "doc_count": 5
        },
        {
          "key": "C",
          "doc_count": 5
        },
        {
          "key": "B",
          "doc_count": 2
        },
        {
          "key": "D",
          "doc_count": 1
        }
      ]
    }
  }
}

返回结果中 aggregations 这个 key 下面的内容即为聚合结果,可以看到已经按 model 分好了桶。
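顺便提醒一点:terms 聚合默认只返回文档数最多的 10 个桶,车型种类一多就可能漏掉部分分组,这和 SQL 的 GROUP BY 返回全部分组不同。实际使用时可以显式调大 size(下面的 100 只是示例值):

GET cars/_search
{
  "size": 0,
  "aggs": {
    "models": {
      "terms": {
        "field": "model.keyword",
        "size": 100
      }
    }
  }
}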

SQL 语句中还有一项是 COUNT(DISTINCT color) color_count,用于计算每个 model 的颜色数,在 Elasticsearch 中我们需要使用指标类聚合 Cardinality(基数聚合)来做不同值计数。语句如下:

GET cars/_search
{
  "size": 0,
  "aggs": {
    "models": {
      "terms": {
        "field": "model.keyword"
      },
      "aggs": {
        "color_count": {
          "cardinality": {
            "field": "color.keyword"
          }
        }
      }
    }
  }
}

其返回结果如下:

{
  "took": 74,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 13,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "models": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "A",
          "doc_count": 5,
          "color_count": {
            "value": 4
          }
        },
        {
          "key": "C",
          "doc_count": 5,
          "color_count": {
            "value": 5
          }
        },
        {
          "key": "B",
          "doc_count": 2,
          "color_count": {
            "value": 2
          }
        },
        {
          "key": "D",
          "doc_count": 1,
          "color_count": {
            "value": 1
          }
        }
      ]
    }
  }
}

结果中 color_count 即为每个 model 的颜色数,但这里所有的模型都返回了,我们只想要颜色数大于1的模型,因此这里还要加一个过滤条件。

4. Having Condition VS Bucket Selector Aggregation

Having color_count > 1 在 Elasticsearch 中对应的是 Bucket Selector 聚合(bucket_selector),语句如下所示:

GET cars/_search
{
  "size": 0,
  "aggs": {
    "models": {
      "terms": {
        "field": "model.keyword"
      },
      "aggs": {
        "color_count": {
          "cardinality": {
            "field": "color.keyword"
          }
        },
        "color_count_filter": {
          "bucket_selector": {
            "buckets_path": {
              "colorCount": "color_count"
            },
            "script": "params.colorCount>1"
          }
        }
      }
    }
  }
}

返回结果如下:

{
  "took": 39,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 13,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "models": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "A",
          "doc_count": 5,
          "color_count": {
            "value": 4
          }
        },
        {
          "key": "C",
          "doc_count": 5,
          "color_count": {
            "value": 5
          }
        },
        {
          "key": "B",
          "doc_count": 2,
          "color_count": {
            "value": 2
          }
        }
      ]
    }
  }
}

此时返回结果只包含颜色数大于1的模型,但大家会发现颜色数多的 C 不是在第一个位置,我们还需要做排序处理。

5. Order By Limit VS Bucket Sort Aggregation

ORDER BY color_count desc LIMIT 2 在 Elasticsearch 中可以使用 Bucket Sort 聚合实现,语句如下所示:

GET cars/_search
{
  "size": 0,
  "aggs": {
    "models": {
      "terms": {
        "field": "model.keyword"
      },
      "aggs": {
        "color_count": {
          "cardinality": {
            "field": "color.keyword"
          }
        },
        "color_count_filter": {
          "bucket_selector": {
            "buckets_path": {
              "colorCount": "color_count"
            },
            "script": "params.colorCount>1"
          }
        },
        "color_count_sort": {
          "bucket_sort": {
            "sort": {
              "color_count": "desc"
            },
            "size": 2
          }
        }
      }
    }
  }
}

返回结果如下:

{
  "took": 32,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 13,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "models": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "C",
          "doc_count": 5,
          "color_count": {
            "value": 5
          }
        },
        {
          "key": "A",
          "doc_count": 5,
          "color_count": {
            "value": 4
          }
        }
      ]
    }
  }
}

至此我们便将 SQL 语句实现的功能用 Elasticsearch 查询语句实现了。对比 SQL 语句与 Elasticsearch 的查询语句,大家会发现后者复杂了很多,但并非无章可循,随着大家对常见语法越来越熟悉,相信一定会越写越得心应手!
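另外,如果集群是 6.3 及以上版本,也可以尝试用官方的 SQL 接口(_xpack/sql)来做类似的分组统计。下面只是一个思路示意:注意它统计的是每个车型的文档数,而不是去重后的颜色数,并不与上面的需求完全等价;HAVING、COUNT(DISTINCT) 等语法的支持情况随版本变化,请以官方文档为准。

POST /_xpack/sql?format=txt
{
  "query": "SELECT model, COUNT(*) AS doc_count FROM cars GROUP BY model ORDER BY doc_count DESC LIMIT 2"
}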


新的kafka集群监控系统使用golang开发

开源地址:https://github.com/kppotato/kafka_monitor
 
项目使用:golang 开发;数据库:prometheus;图形:grafana。

_validate/query?explain解释

使用 _validate/query?explain API 得到的结果如下,Synonym 是什么意思啊?同义词吗?求解释:

{
  "valid": true,
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "explanations": [
    {
      "index": "country",
      "valid": true,
      "explanation": "name:z Synonym(name:g name:zg)"
    }
  ]
}