数据量大，es性能优化

Elasticsearch | 作者 febmark | 发布于2021年05月21日 | 阅读数：3383

我有个文章的爬虫库，mapping如下：

{

  "article": {

    "mappings": {

      "dynamic": "strict",

      "properties": {

        "article_id": { //文章id

          "type": "keyword"

        },

        "publish_date": { //文章发布时间

          "type": "date",

          "format": "yyyy-MM-dd"

        },

        "link": {   //文章链接

          "type": "keyword"

        },

        "site": { //文章来源站点

          "type": "keyword"

        },

        "article_author": { //文章作者

          "type": "text",

          "fields": {

            "keyword": {

              "type": "keyword",

              "ignore_above": 256

            }

          },

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart"

        },

        "article_title": { //文章标题

          "type": "text",

          "fields": {

            "keyword": {

              "type": "keyword",

              "ignore_above": 256

            },

            "pinyin": {

              "type": "text",

              "boost": 10.0,

              "term_vector": "with_offsets",

              "analyzer": "pinyin_analyzer"

            }

          },

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart"

        },

        "article_desc": { //文章描述

          "type": "text",

          "fields": {

            "keyword": {

              "type": "keyword",

              "ignore_above": 256

            },

            "pinyin": {

              "type": "text",

              "boost": 10.0,

              "term_vector": "with_offsets",

              "analyzer": "pinyin_analyzer"

            }

          },

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart"

        },

        "report": { //文章每天的报表数据统计

          "type": "nested",

          "properties": {

            "rdt_click": {//点击量

              "type": "integer",

              "index": false

            },

            "rdt_impression": {//曝光量

              "type": "integer",

              "index": false

            },

            "rdt_comment": {//评论数

              "type": "integer",

              "index": false

            },

            "rdt_like": {//点赞数

              "type": "integer",

              "index": false

            },

            "rdt_date": {//报表日期

              "type": "date",

              "format": "yyyy-MM-dd"

            }

          }

        }

      }

    }

  }

}

数据都存到一个索引里面，目前这个索引有30多G。有些查询很慢，经常报OOM的错误。现在是一篇文章对应一条记录，报表的数据通过nested字段存储。每天一条存到nested字段里面。想对这块做一个优化，提升查询的速度。

现在想按照report里面的每天的报表数据拆分开，索引按照月份来存，以report里面的rdt_date所在的月份创建索引。索引的结构如下。

{

  "article" : {

    "mappings" : {

      "dynamic" : "strict",

      "properties" : {

        "article_id" : {

          "type" : "keyword"

        },

        "publish_date" : {

          "type" : "date",

          "format" : "yyyy-MM-dd"

        },

        "link" : {

          "type" : "keyword"

        },

        "site" : {

          "type" : "keyword"

        },

        "article_author": { //文章作者

          "type": "text",

          "fields": {

            "keyword": {

              "type": "keyword",

              "ignore_above": 256

            }

          },

          "analyzer": "ik_max_word",

          "search_analyzer": "ik_smart"

        },

        "article_title" : {

          "type" : "text",

          "fields" : {

            "keyword" : {

              "type" : "keyword",

              "ignore_above" : 256

            },

            "pinyin" : {

              "type" : "text",

              "boost" : 10.0,

              "term_vector" : "with_offsets",

              "analyzer" : "pinyin_analyzer"

            }

          },

          "analyzer" : "ik_max_word",

          "search_analyzer" : "ik_smart"

        },

        "article_desc" : {

          "type" : "text",

          "fields" : {

            "keyword" : {

              "type" : "keyword",

              "ignore_above" : 256

            },

            "pinyin" : {

              "type" : "text",

              "boost" : 10.0,

              "term_vector" : "with_offsets",

              "analyzer" : "pinyin_analyzer"

            }

          },

          "analyzer" : "ik_max_word",

          "search_analyzer" : "ik_smart"

        },

        "rdt_click" : {

          "type" : "integer",

          "index" : false

        },

        "rdt_impression" : {

          "type" : "integer",

          "index" : false

        },

        "rdt_comment" : {

          "type" : "integer",

          "index" : false

        },

        "rdt_like" : {

          "type" : "integer",

          "index" : false

        },

        "rdt_date" : {

          "type" : "date",

          "format" : "yyyy-MM-dd"

        }

      }

    }

  }

}

没什么变化，只是把nested拆开了。但是这样的话，文档数跟存储空间就增加了很多。比如一篇文章有100条报表数据，在之前的索引里是一条记录，但是现在则是100条记录。刷完数据算下来，存储空间占用了90G左右，是原来的3倍左右。

但是这样有些查询还是不能满足需求：有些月份的数据有40多G，有些查询排序还是很慢，甚至OOM。如果是按照全部月份索引全局查询，就更慢了。

想请教下这块应该怎么优化。有些数据它可能是全局的查询，比如按照曝光量倒序排序，找出一篇全网最高的文章，不限月份。

我还想通过histogram field类型存储report报表的数据，这样还是对应一条数据，不会拆成多条，但是这个histogram field字段不好过滤，比如只统计最近一个礼拜的报表数据。

2 个回复

f_on

你看看机器内存设置的多大？

匿名用户

真是滥用ES集群了.不管什么场景,不管合适不适合,全都抛给ES.

ES根本无法解决大数据量的查询,聚合.

人家只是个全文检索引擎,不是大数据计算引擎.

要回复问题请先登录或注册

数据量大，es性能优化

2 个回复

发起人

活动推荐

相关问题

问题状态

数据量大，es性能优化

与内容相关的链接

2 个回复

发起人

活动推荐

相关问题

问题状态