好的想法是十分钱一打,真正无价的是能够实现这些想法的人。

Elasticsearch 7.10.0:scripted_metric 使用 HashMap 去重计数,与 cardinality 去重查询得到的结果不一致

Elasticsearch | 作者 yuan0710shuai | 发布于2021年04月08日 | 阅读数:1637

GET repair/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"term": {
"typeName": {
"value": "书面",
"boost": 1
}
}
},
{
"range": {
"processStatus": {
"from": null,
"to": 12,
"include_lower": true,
"include_upper": false,
"boost": 1
}
}
},
{
"terms": {
"projectId": [
"499"
],
"boost": 1
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
},
"aggregations": {
"countOrder": {
"terms": {
"field": "projectId",
"size": 732,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"houseClose": {
"scripted_metric": {
"init_script": {
"source": "state.numas=new HashMap();",
"lang": "painless"
},
"map_script": {
"source": """
if(doc.houseId.size()>0){
String houseKey = doc.houseId.value;
state.numas.put(houseKey,1);
}
""",
"lang": "painless"
},
"combine_script": {
"source": """
double item_finish_count=0;
for(key in state.numas.keySet()){
item_finish_count+=1;
}
return item_finish_count;""",
"lang": "painless"
},
"reduce_script": {
"source": """double result=0;
for(e in states){
result+=e;
}
return result;""",
"lang": "painless"
},
"params": {
"close_sum_key": "close_sum3",
"house_sum_key": "house_sum3"
}
}
},
"houseCount": {
"cardinality": {
"field": "houseId"
}
}
}
}
}
}
结果集:
{
"took" : 392,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"countOrder" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "499",
"doc_count" : 17573,
"houseCount" : {
"value" : 1256
},
"houseClose" : {
"value" : 4102.0
}
}
]
}
}
}
问题:houseClose 和 houseCount 都是按 houseId 去重后的数量,理应一致,但实际结果相差很大(4102 对 1256)。
已邀请:

amc - tbd

赞同来自: yuan0710shuai

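数据差异是跨分片重复计数导致的:combine_script 在每个分片上各自返回该分片内去重后的数量,reduce_script 再把各分片的数量直接相加,因此出现在多个分片上的同一个 houseId 会被统计多次。比如一共4个文档,2个分片
分片1:a,c
分片2:b,c
此时cardinality结果为3,但scripted_metric结果为4(2+2,即reduce_script把两个分片各自的去重数相加)。
要得到全局去重的结果,可以让 combine_script 直接返回 state.numas,再在 reduce_script 中把各分片的 key 合并到同一个 HashSet 后返回其 size()。
下面是示例索引 scripted_metric_demo 的查询结果: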
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "scripted_metric_demo",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"id" : "1",
"houseId" : "a"
}
},
{
"_index" : "scripted_metric_demo",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"id" : "3",
"houseId" : "b"
}
},
{
"_index" : "scripted_metric_demo",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"id" : "4",
"houseId" : "c"
}
},
{
"_index" : "scripted_metric_demo",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"id" : "4",
"houseId" : "c"
}
}
]
},
"aggregations" : {
"scripted_metric_count" : {
"value" : 4.0
},
"cardinality_count" : {
"value" : 3
}
}
}

要回复问题请先登录或注册