我有点怀疑你在刷屏

Web Scraper + Elasticsearch + Kibana + SearchKit 打造的豆瓣电影top250 搜索演示系统

Elasticsearch | 作者 | 发布于2023年04月09日 | | 阅读数:4950

Web Scraper + Elasticsearch + Kibana + SearchKit 打造的豆瓣电影top250 搜索演示系统

作者:小森同学

声明:电影数据来源于“豆瓣电影”,如有侵权,请联系删除

1.png

2.png

Web Scraper

{
    "_id": "top250",
    "startUrl": ["https://movie.douban.com/top250?start=[0-225:25]&filter="],
    "selectors": [{
        "id": "container",
        "multiple": true,
        "parentSelectors": ["_root"],
        "selector": ".grid_view li",
        "type": "SelectorElement"
    }, {
        "id": "name",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "",
        "selector": "span.title:nth-of-type(1)",
        "type": "SelectorText"
    }, {
        "id": "number",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "",
        "selector": "em",
        "type": "SelectorText"
    }, {
        "id": "score",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "",
        "selector": "span.rating_num",
        "type": "SelectorText"
    }, {
        "id": "review",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "",
        "selector": "span.inq",
        "type": "SelectorText"
    }, {
        "id": "year",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "\\d{4}",
        "selector": "p:nth-of-type(1)",
        "type": "SelectorText"
    }, {
        "id": "tour_guide",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "^导演: \\S*",
        "selector": "p:nth-of-type(1)",
        "type": "SelectorText"
    }, {
        "id": "type",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "[^/]+$",
        "selector": "p:nth-of-type(1)",
        "type": "SelectorText"
    }, {
        "id": "area",
        "multiple": false,
        "parentSelectors": ["container"],
        "regex": "[^\\/]+(?=\\/[^\\/]*$)",
        "selector": "p:nth-of-type(1)",
        "type": "SelectorText"
    }, {
        "id": "detail_link",
        "multiple": false,
        "parentSelectors": ["container"],
        "selector": ".hd a",
        "type": "SelectorLink"
    }, {
        "id": "director",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "",
        "selector": "span:nth-of-type(1) .attrs a",
        "type": "SelectorText"
    }, {
        "id": "screenwriter",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "(?<=编剧: )[\\u4e00-\\u9fa5A-Za-z0-9/()\\·\\s]+(?=主演)",
        "selector": "div#info",
        "type": "SelectorText"
    }, {
        "id": "film_length",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "\\d+",
        "selector": "span[property='v:runtime']",
        "type": "SelectorText"
    }, {
        "id": "IMDb",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "(?<=[IMDb:\\s+])\\S*(?=\\d*$)",
        "selector": "div#info",
        "type": "SelectorText"
    }, {
        "id": "language",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "(?<=语言: )\\S+",
        "selector": "div#info",
        "type": "SelectorText"
    }, {
        "id": "alias",
        "multiple": false,
        "parentSelectors": ["detail_link"],
        "regex": "(?<=又名: )[\\u4e00-\\u9fa5A-Za-z0-9/()\\s]+(?=IMDb)",
        "selector": "div#info",
        "type": "SelectorText"
    }, {
        "id": "pic",
        "multiple": false,
        "parentSelectors": ["container"],
        "selector": "img",
        "type": "SelectorImage"
    }]
}

elasticsearch

{
    "mappings": {
      "properties": {
        "IMDb": {
          "type": "keyword",
          "copy_to": [
            "all"
          ]
        },
        "alias": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "all": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "area": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "director": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "film_length": {
          "type": "long"
        },
        "id": {
          "type": "keyword"
        },
        "language": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "link": {
          "type": "keyword"
        },
        "name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "number": {
          "type": "long"
        },
        "photo": {
          "type": "keyword"
        },
        "review": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "score": {
          "type": "double"
        },
        "screenwriter": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "type": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "copy_to": [
            "all"
          ],
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        },
        "year": {
          "type": "long"
        }
      }
    }
  }

kibana

需要使用pipeline对索引字段进行处理,如对type 通过空格进行分割为数组等,可以参照官方文档或其他博客。

制作仪表板省略, 请自行搜索

SearchKit

参考 https://github.com/searchkit/searchkit-starter-app


[尊重社区原创,转载请保留或注明出处]
本文地址:http://elasticsearch.cn/article/14863


0 个评论

要回复文章请先登录注册