Elasticsearch Analyzers and Aggregations, Data Enrichment and Deduplication

  • 1. analyzer
  • 1.1. What is analysis?
  • 1.2. How to define a custom analyzer
  • 1.3. Chinese analyzers
  • 2. Aggregations
  • 2.1. Bucket aggregation
  • 3. Observability: analyzing geospatial data with the Elastic Stack
  • 3.1. Mapping setup
  • 3.2. flights_logstash.conf configuration
  • 3.3. Starting Logstash and analyzing the data in Kibana
  • 4. Importing MySQL data into Elasticsearch
  • 5. Handling duplicate documents with Logstash
  • 6. Data enrichment
  • 7. Parting words: programmers make mistakes not because they don't know something, but because they think they know everything.

1. analyzer

1.1. What is analysis?

  • Analysis is the process Elasticsearch runs on a document's text before the document is added to the inverted index
  • Whenever a document is ingested for indexing, its text passes through character filters, a tokenizer, and token filters before the document is finally written into Elasticsearch (see the _analyze example below)
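  • As a quick illustration, the _analyze API shows how the default standard analyzer turns a piece of text into the tokens that end up in the inverted index (the sample sentence is only an example):
GET _analyze
{
  "analyzer": "standard",
  "text": "These are not the Droids you are looking for."
}
  • The response lists the lowercased tokens together with their positions and offsets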

1.2. How to define a custom analyzer

  • Here we mainly combine existing building blocks (char_filter, tokenizer, token filter) to assemble a custom analyzer
DELETE blogs
 
PUT blogs
{
  "settings": {
    "analysis": {
      "char_filter": {
        "xschool_filter": {
          "type": "mapping",
          "mappings": [
            "X-Game => XGame"
          ]
        }
      },
      "analyzer": {
        "my_content_analyzer": {
          "type": "custom",
          "char_filter": [
            "xschool_filter"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_stop"
          ]
        }
      },
      "filter": {
        "my_stop": {
          "type": "stop",
          "stopwords": ["so", "to", "the"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "my_content_analyzer"
      }
    }
  }
}
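
  • To check that the custom analyzer behaves as intended (the char_filter rewrite, lowercasing, and stop-word removal), it can be tested with the _analyze API; the sample text here is my own:
POST blogs/_analyze
{
  "analyzer": "my_content_analyzer",
  "text": "So the X-Game is to start"
}
  • The response should contain only the tokens xgame, is, and start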

1.3. Chinese analyzers
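
  • As a minimal sketch, assuming the analysis-ik plugin has been installed, a Chinese sentence can be tokenized with its ik_max_word analyzer:
GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "我爱北京天安门"
}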

2. Aggregations

2.1. Bucket aggregation

  • We will focus on the histogram, range, filters, and terms bucket aggregations
  • A bucket aggregation is a way of grouping together documents that meet the same criterion
  • Example 1

  • Sample data
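  • The aggregation below assumes keyword fields category and country and a numeric age field; a minimal, made-up equivalent of the sample data can be indexed like this:
DELETE users

PUT users
{
  "mappings": {
    "properties": {
      "category": { "type": "keyword" },
      "country": { "type": "keyword" },
      "age": { "type": "integer" }
    }
  }
}

PUT users/_bulk?refresh
{ "index": {} }
{ "category": "sports", "country": "china", "age": 28 }
{ "index": {} }
{ "category": "sports", "country": "us", "age": 32 }
{ "index": {} }
{ "category": "music", "country": "china", "age": 24 }
{ "index": {} }
{ "category": "music", "country": "us", "age": 30 }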

  • Compute the average age per country within each category
GET users/_search
{
  "size": 0,
  "aggs": {
    "categories": {
      "terms": {
        "field": "category"
      },
      "aggs": {
        "countries": {
          "terms": {
            "field": "country"
          },
          "aggs": {
            "average_age": {
              "avg": {
                "field": "age"
              }
            }
          }
        }
      }
    }
  }
}

  • Result statistics: the response buckets documents first by category, then by country, with an average_age value per country bucket

  • Example 2
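  • The sports index used by the remaining examples is assumed to hold athlete documents with keyword fields sport and role, numeric fields goals and age, and a date field birthdate; a couple of made-up documents are enough to try the queries:
DELETE sports

PUT sports
{
  "mappings": {
    "properties": {
      "sport": { "type": "keyword" },
      "role": { "type": "keyword" },
      "goals": { "type": "integer" },
      "age": { "type": "integer" },
      "birthdate": { "type": "date" }
    }
  }
}

PUT sports/_bulk?refresh
{ "index": {} }
{ "sport": "Basketball", "role": "forward", "goals": 520, "age": 24, "birthdate": "1996-03-01" }
{ "index": {} }
{ "sport": "Basketball", "role": "defender", "goals": 180, "age": 31, "birthdate": "1989-07-12" }
{ "index": {} }
{ "sport": "Football", "role": "forward", "goals": 42, "age": 19, "birthdate": "2001-05-20" }
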
  • Filters aggregation (average goals for the defender role and for the forward role)
GET sports/_search
{
  "size": 0, 
  "aggs": {
    "athletes": {
      "filters": {
        "filters": {
          "defenders": {
            "term": {
              "role": "defender"
            }
          },
          "forwards": {
            "term": {
              "role": "forward"
            }
          }
        }
      },
      "aggs": {
        "avg_goals": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}


  • Terms aggregation (average goals within each sport):
GET sports/_search
{
  "size": 0,
  "aggs": {
    "sports": {
      "terms": {
        "field": "sport"
      },
      "aggs": {
        "avg_scoring": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}


  • Histogram aggregation (number of documents per goals interval for the sport Basketball)
POST sports/_search
{
  "size": 0,
  "aggs": {
    "baskketball_filter": {
      "filter": {
        "term": {
          "sport": "Basketball"
        }
      },
      "aggs": {
        "goals_histogram": {
          "histogram": {
            "field": "goals",
            "interval": 200
          }
        }
      }
    }
  }
}


  • Date histogram aggregation (documents per birthdate year)
GET sports/_search
{
  "size": 0,
  "aggs": {
    "birthdays": {
      "date_histogram": {
        "field": "birthdate",
        "interval": "year"
      }
    }
  }
}


GET sports/_search
{
  "size": 0,
  "aggs": {
    "birthdays": {
      "date_histogram": {
        "field": "birthdate",
        "interval": "year"
      },
      "aggs": {
        "average_goals": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}


  • Range aggregation (bucketing athletes by age ranges)
GET sports/_search
{
  "size": 0,
  "aggs": {
    "goal_ranges": {
      "range": {
        "field": "age",
        "ranges": [
          {
            "key": "start-of-career",
            "to": 20
          },
          {
            "key": "mid-of-career",
            "from": 20,
            "to": 30
          },
          {
            "key": "end-of-cereer",
            "from": 30
          }
        ]
      }
    }
  }
}


3. Observability: analyzing geospatial data with the Elastic Stack

3.1. Mapping setup

PUT flights
{
  "mappings": {
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "baro_altitude": {
        "type": "float"
      },
      "callsign": {
        "type": "keyword"
      },
      "geo_altitude": {
        "type": "float"
      },
      "icao": {
        "type": "keyword"
      },
      "last_contact": {
        "type": "long"
      },
      "location": {
        "type": "geo_point"
      },
      "on_ground": {
        "type": "boolean"
      },
      "origin_country": {
        "type": "keyword"
      },
      "position_source": {
        "type": "keyword"
      },
      "request_time": {
        "type": "long"
      },
      "spi": {
        "type": "boolean"
      },
      "squawk": {
        "type": "long"
      },
      "time_position": {
        "type": "long"
      },
      "true_track": {
        "type": "float"
      },
      "velocity": {
        "type": "float"
      },
      "vertical_rate": {
        "type": "float"
      }
    }
  }
}

3.2. flights_logstash.conf configuration

input {
    http_poller {
        codec => "json"
        schedule => { every => "15s" }
        urls => {
            url => "https://opensky-network.org/api/states/all"
        }
    }
}
 
filter {
    split {
        field => "states"
        add_field => {
            "icao" => "%{[states][0]}"
            "callsign" => "%{[states][1]}"
            "origin_country" => "%{[states][2]}"
            "time_position" => "%{[states][3]}"
            "last_contact" => "%{[states][4]}"
            "location" => "%{[states][6]},%{[states][5]}"
            "baro_altitude" => "%{[states][7]}"
            "on_ground" => "%{[states][8]}"
            "velocity" => "%{[states][9]}"
            "true_track" => "%{[states][10]}"
            "vertical_rate" => "%{[states][11]}"
            "geo_altitude" => "%{[states][13]}"
            "squawk" => "%{[states][14]}"
            "spi" => "%{[states][15]}"
            "position_source" => "%{[states][16]}"
        }
    }
    mutate {
        strip => ["callsign"]
        rename => { "time" => "request_time" }
        remove_field => ["states", "@version"]
    }
    translate {
        field => "[position_source]"
        destination => "position_source"
        override => "true"
        dictionary => {
          "0" => "ADS-B"
          "1" => "ASTERIX"
          "2" => "MLAB"
        }
    }
 
    if [time_position] =~ /^%{*/ {
        drop { }
    }
    if [callsign] =~ /^%{*/ {
        mutate { remove_field => ["callsign"] }
    }
    if [location] =~ /^%{*/ {
        mutate { remove_field => ["location"] }
    }
    if [baro_altitude] =~ /^%{*/ {
        mutate { remove_field => ["baro_altitude"] }
    }
    if [velocity] =~ /^%{*/ {
        mutate { remove_field => ["velocity"] }
    }
    if [true_track] =~ /^%{*/ {
        mutate { remove_field => ["true_track"] }
    }
    if [vertical_rate] =~ /^%{*/ {
        mutate { remove_field => ["vertical_rate"] }
    }
    if [sensors] =~ /^%{*/ {
        mutate { remove_field => ["sensors"] }
    }
    if [geo_altitude] =~ /^%{*/ {
        mutate { remove_field => ["geo_altitude"] }
    }
    if [squawk] =~ /^%{*/ {
        mutate { remove_field => ["squawk"] }
    }
 
    mutate {
        convert => {
            "baro_altitude" => "float"
            "geo_altitude" => "float"
            "last_contact" => "integer"
            "on_ground" => "boolean"
            "request_time" => "integer"
            "spi" => "boolean"
            "squawk" => "integer"
            "time_position" => "integer"
            "true_track" => "float"
            "velocity" => "float"
            "vertical_rate" => "float"
        }
    }
}
 
output {
    stdout { 
        codec => rubydebug
    }
 
    elasticsearch {
        manage_template => "false"
        index => "flights"
        # pipeline => "flights_aircraft_enrichment"
        hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
        user => "elastic"
        password => "cxYiWW4vFEE4nuubo8TZVyrY"
    }
}

3.3. Starting Logstash and analyzing the data in Kibana
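
  • Assuming the configuration above is saved as config/flights_logstash.conf (the exact path is only an example), start Logstash with bin/logstash -f config/flights_logstash.conf
  • Once documents arrive in the flights index, create an index pattern in Kibana and explore the location field on a map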

4. Importing MySQL data into Elasticsearch

input {
	jdbc {
       jdbc_connection_string => "jdbc:mysql://192.168.1.30:3306/db_test"
       jdbc_user => "root"
       jdbc_password => "123456"
       jdbc_validate_connection => true
       jdbc_driver_class => "com.mysql.cj.jdbc.Driver"
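       # jdbc_driver_library must point at the MySQL Connector/J JAR mentioned in the
       # notes below; the path in this commented example is only illustrative
       # jdbc_driver_library => "/path/to/mysql-connector-java.jar"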
       parameters => { "Product_id" => "Product1" }
       statement => "SELECT * FROM SalesJan2009 WHERE Product = :Product_id"
    }    
}
 
filter {
    mutate {
        rename => {
            "longitude" => "[location][lon]"
            "latitude" => "[location][lat]"
        }
    }
}

output {
    stdout {
    }

    elasticsearch {
        hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
        index => "sales"
        document_type => "_doc"
        user => "elastic"
        password => "cxYiWW4vFEE4nuubo8TZVyrY"
    }
}
  • Add the MySQL JDBC driver (Connector/J) that matches your MySQL version
  • Start Logstash with bin/logstash -f config/sales.conf and analyze the data in Kibana
  • Note
  • Logstash builds a custom longitude/latitude pair here: the rename in the filter turns the two columns into a [location][lon]/[location][lat] object so it can be indexed as the geo_point type (see the mapping sketch below)
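  • For the renamed [location][lon] and [location][lat] fields to be searchable as coordinates, the sales index needs a geo_point mapping; a minimal sketch (Elasticsearch 7.x syntax) looks like this:
PUT sales
{
  "mappings": {
    "properties": {
      "location": {
        "type": "geo_point"
      }
    }
  }
}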

5. Handling duplicate documents with Logstash

  • Configuration file
input {
    http {
        id => "data_http_input"
    }
}
 
filter {
    fingerprint {
        source => [ "sensor_id", "date"]
        target => "[@metadata][fingerprint]"
        method => "SHA1"
        key => "xiaofan"
        concatenate_sources => true
        base64encode => true
    }
}
 
output {
    stdout {
        codec => rubydebug
    }
 
    elasticsearch {
        manage_template => "false"
        index => "fingerprint"
        hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
        document_id => "%{[@metadata][fingerprint]}"
        user => "elastic"
        password => "cxYiWW4vFEE4nuubo8TZVyrY"
    }
}
  • Note: the http input plugin is used to collect the data (it listens on port 8080 by default)
  • curl -XPOST --header "Content-Type:application/json" "http://localhost:8080/" -d '{"sensor_id":1, "date": "2015-01-01", "reading":16.24}'
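  • Posting the same JSON body again yields the same fingerprint, hence the same document_id, so the event overwrites the existing document instead of creating a duplicate; one quick check is that the document count stays at 1:
GET fingerprint/_count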

6. Data enrichment