使用 shuf 来打乱一个文件中的行或是选择文件中一个随机的行。

elasticsearch query与agg分开是否可以提高性能?

Elasticsearch | 作者 Ryze | 发布于2021年07月07日 | 阅读数:1705


假如我有一个名为 article 的索引,有1亿条记录,它的 mappings 文件和数据样式如下。我想问的是,像 query1 这样将 query 子句和 agg 子句拆分开、使用 bulk api 去执行,是否可以获得比 query2 更高的性能呢? (elasticsearch version 7.x)
 
mappings:
{
"article" : {
"aliases" : { },
"mappings" : {
"properties" : {
"articleID" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"author_first_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
},
"copy_to" : [
"new_author_full_name"
]
},
"author_full_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"author_last_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
},
"copy_to" : [
"new_author_full_name"
]
},
"content" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"follower_num" : {
"type" : "long"
},
"hidden" : {
"type" : "boolean"
},
"new_author_first_name" : {
"type" : "text",
"copy_to" : [
"new_author_full_name"
]
},
"new_author_full_name" : {
"type" : "text"
},
"new_author_last_name" : {
"type" : "text",
"copy_to" : [
"new_author_full_name"
]
},
"postDate" : {
"type" : "date"
},
"sub_title" : {
"type" : "text",
"fields" : {
"std" : {
"type" : "text",
"analyzer" : "standard"
}
},
"analyzer" : "english"
},
"tag" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"tag_cnt" : {
"type" : "long"
},
"title" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"userID" : {
"type" : "long"
},
"view_cnt" : {
"type" : "long"
}
}
},
"settings" : {
"index" : {
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"number_of_shards" : "1",
"provided_name" : "article",
"creation_date" : "1609156191282",
"number_of_replicas" : "1",
"uuid" : "_LaU_8fuTlCsJ9ZvEqUCjA",
"version" : {
"created" : "7100199"
}
}
}
}
}

 数据样式:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "article",
"_type" : "_doc",
"_id" : "6",
"_score" : 1.0,
"_source" : {
"title" : "this is java and hadoop blog"
}
},
{
"_index" : "article",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"view_cnt" : 30,
"sub_title" : "learning more courses",
"author_last_name" : "Smith",
"hidden" : false,
"new_author_first_name" : "Peter",
"articleID" : "XHDK-A-1293-#fJ3",
"title" : "this is java and elasticsearch blog",
"userID" : 1,
"content" : "i like to write best elasticsearch article",
"author_first_name" : "Peter",
"tag_cnt" : 2,
"postDate" : "2017-01-01",
"tag" : [
"java",
"hadoop"
],
"new_author_last_name" : "Smith",
"follower_num" : 5
}
},
{
"_index" : "article",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"view_cnt" : 50,
"sub_title" : "learned a lot of course",
"author_last_name" : "Williams",
"hidden" : false,
"new_author_first_name" : "Smith",
"articleID" : "KDKE-B-9947-#kL5",
"title" : "this is java blog",
"userID" : 1,
"content" : "i think java is the best programming language",
"author_first_name" : "Smith",
"tag_cnt" : 1,
"postDate" : "2017-01-02",
"tag" : [
"java"
],
"new_author_last_name" : "Williams",
"follower_num" : 10
}
},
{
"_index" : "article",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"view_cnt" : 100,
"sub_title" : "we have a lot of fun",
"author_last_name" : "Ma",
"hidden" : false,
"new_author_first_name" : "Jack",
"articleID" : "JODL-X-1937-#pV7",
"title" : "this is elasticsearch blog",
"userID" : 2,
"content" : "i am only an elasticsearch beginner",
"author_first_name" : "Jack",
"tag_cnt" : 1,
"postDate" : "2017-01-01",
"tag" : [
"hadoop"
],
"new_author_last_name" : "Ma",
"follower_num" : 25
}
},
{
"_index" : "article",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"view_cnt" : 80,
"sub_title" : "both of them are good",
"author_last_name" : "Li",
"hidden" : true,
"new_author_first_name" : "Robbin",
"articleID" : "QQPX-R-3956-#aD8",
"title" : "this is java, elasticsearch, hadoop blog",
"userID" : 2,
"content" : "elasticsearch and hadoop are all very good solution, i am a beginner",
"author_first_name" : "Robbin",
"tag_cnt" : 2,
"postDate" : "2017-01-02",
"tag" : [
"java",
"elasticsearch"
],
"new_author_last_name" : "Li",
"follower_num" : 3
}
},
{
"_index" : "article",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"view_cnt" : 10,
"sub_title" : "haha, hello world",
"author_last_name" : "Peter Smith",
"hidden" : false,
"new_author_first_name" : "Tonny",
"articleID" : "DHJK-B-1395-#Ky5",
"title" : "this is spark blog",
"userID" : 3,
"content" : "spark is best big data solution based on scala ,an programming language similar to java",
"author_first_name" : "Tonny",
"tag_cnt" : 1,
"postDate" : "2017-03-01",
"tag" : [
"elasticsearch"
],
"new_author_last_name" : "Peter Smith",
"follower_num" : 60
}
}
]
}
}


query1: includes a "query" DSL and an "aggs" DSL
 
### query
GET /article/_search
{
"from": 0,
"size": 1000,
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"hidden": {
"value": "false"
}
}
},
{
"match": {
"new_author_last_name": "Smith"
}
},
{
"range": {
"view_cnt": {
"gte": 1
}
}
},
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "i",
"fields": ["content"]
}
}
]
}
}
],
"must_not": [
{
"match": {
"author_first_name": "Danny"
}
}
]
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1
}
},
"field_value_factor": {
"field": "follower_num",
"factor": 1,
"missing": 22.5,
"modifier": "none"
}
}
]
}
},
"_source": false,
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"follower_num": {
"order": "desc",
"missing": 0
}
},
{
"view_cnt": {
"order": "desc",
"missing": 0
}
}
],

"collapse": {
"field": "userID"
}
}








### agg
GET /article/_search
{
"size": 0,
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"hidden": {
"value": "false"
}
}
},
{
"match": {
"new_author_last_name": "Smith"
}
},
{
"range": {
"view_cnt": {
"gte": 1
}
}
},
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "i",
"fields": ["content"]
}
}
]
}
}
],
"must_not": [
{
"match": {
"author_first_name": "Danny"
}
}
]
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1
}
},
"field_value_factor": {
"field": "follower_num",
"factor": 1,
"missing": 22.5,
"modifier": "none"
}
}
]
}
},
"_source": false,

"aggs": {
"userID_agg": {
"terms": {
"field": "userID",
"size": 100
}
}
}
}

query2:
GET /article/_search
{
"from": 0,
"size": 1000,
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"hidden": {
"value": "false"
}
}
},
{
"match": {
"new_author_last_name": "Smith"
}
},
{
"range": {
"view_cnt": {
"gte": 1
}
}
},
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "i",
"fields": ["content"]
}
}
]
}
}
],
"must_not": [
{
"match": {
"author_first_name": "Danny"
}
}
]
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1
}
},
"field_value_factor": {
"field": "follower_num",
"factor": 1,
"missing": 22.5,
"modifier": "none"
}
}
]
}
},
"_source": false,
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"follower_num": {
"order": "desc",
"missing": 0
}
},
{
"view_cnt": {
"order": "desc",
"missing": 0
}
}
],
"aggs": {
"userID_agg": {
"terms": {
"field": "userID",
"size": 100
}
}
},

"collapse": {
"field": "userID"
}
}
[code]
### query
GET /article/_search
{
"from": 0,
"size": 1000,
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"hidden": {
"value": "false"
}
}
},
{
"match": {
"new_author_last_name": "Smith"
}
},
{
"range": {
"view_cnt": {
"gte": 1
}
}
},
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "i",
"fields": ["content"]
}
}
]
}
}
],
"must_not": [
{
"match": {
"author_first_name": "Danny"
}
}
]
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1
}
},
"field_value_factor": {
"field": "follower_num",
"factor": 1,
"missing": 22.5,
"modifier": "none"
}
}
]
}
},
"_source": false,
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"follower_num": {
"order": "desc",
"missing": 0
}
},
{
"view_cnt": {
"order": "desc",
"missing": 0
}
}
],

"collapse": {
"field": "userID"
}
}








### agg
GET /article/_search
{
"size": 0,
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"hidden": {
"value": "false"
}
}
},
{
"match": {
"new_author_last_name": "Smith"
}
},
{
"range": {
"view_cnt": {
"gte": 1
}
}
},
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "i",
"fields": ["content"]
}
}
]
}
}
],
"must_not": [
{
"match": {
"author_first_name": "Danny"
}
}
]
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1
}
},
"field_value_factor": {
"field": "follower_num",
"factor": 1,
"missing": 22.5,
"modifier": "none"
}
}
]
}
},
"_source": false,

"aggs": {
"userID_agg": {
"terms": {
"field": "userID",
"size": 100
}
}
}
}
[/code]
已邀请:

tongchuan1992 - 学无止境、学以致用

赞同来自: Ryze

不会,es跟数据库一样,都是会对语句进行优化的,你这两条语句命中的文档数是一样的,所以并不会提升性能。我感觉第二种可能更快一点。

要回复问题请先登录注册