Elasticsearch Analyzers and Aggregations, Data Enrichment and Deduplication
- 1. analyzer
- 1.1. What is analysis?
- 1.2. How to define a custom analyzer
- 1.3. Chinese analyzers
- 2. Aggregations
- 2.1. Bucket aggregation
- 3. Observability: analyzing geospatial data with the Elastic Stack
- 3.1. Mapping setup
- 3.2. flights_logstash.conf configuration
- 3.3. Starting Logstash and analyzing the data in Kibana
- 4. Importing MySQL data into Elasticsearch
- 5. Handling duplicate documents with Logstash
- 6. Data enrichment
- 7. Parting thought: programmers make mistakes not because they don't understand, but because they think they understand everything.
1. analyzer
1.1. What is analysis?
- Analysis is the process Elasticsearch performs on the body of a document before it is added to the inverted index.
- Whenever a document is taken in by an ingest node, it passes through the following steps (character filters, then a tokenizer, then token filters) before the resulting terms are finally written into Elasticsearch.
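For a quick look at what analysis produces, you can run the built-in standard analyzer over a sample sentence with the _analyze API (the sample text here is made up):
GET _analyze
{
  "analyzer": "standard",
  "text": "The Quick Brown Fox"
}
The response lists the lowercased tokens the, quick, brown, and fox.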
1.2. How to define a custom analyzer
- Here we mainly combine existing building blocks (a char_filter, a tokenizer, and token filters) into a custom analyzer
DELETE blogs
PUT blogs
{
  "settings": {
    "analysis": {
      "char_filter": {
        "xschool_filter": {
          "type": "mapping",
          "mappings": [
            "X-Game => XGame"
          ]
        }
      },
      "analyzer": {
        "my_content_analyzer": {
          "type": "custom",
          "char_filter": [
            "xschool_filter"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_stop"
          ]
        }
      },
      "filter": {
        "my_stop": {
          "type": "stop",
          "stopwords": ["so", "to", "the"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "my_content_analyzer"
      }
    }
  }
}
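To verify the analyzer behaves as intended, test it with the _analyze API (the sample text is made up):
GET blogs/_analyze
{
  "analyzer": "my_content_analyzer",
  "text": "So the X-Game is to be held"
}
The char_filter first rewrites X-Game to XGame, the standard tokenizer splits the text, lowercase lowercases each token, and my_stop drops so, to, and the, leaving the tokens xgame, is, be, and held.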
1.3. Chinese analyzers
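A widely used Chinese analyzer is the IK plugin; assuming analysis-ik is installed on the cluster, you can try it with the _analyze API:
GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "我爱北京天安门"
}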
2. Aggregations
2.1. Bucket aggregation
A bucket aggregation is a way of grouping documents that share a common criterion. We will focus on bucket aggregations such as histogram, range, filters, and terms.
- Case 1
- Sample data (a made-up dataset is sketched below)
- Compute the average age per country within each category
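The original sample data is not shown here; a minimal made-up dataset that matches the query below could look like this (the field names, types, and values are assumptions):
PUT users
{
  "mappings": {
    "properties": {
      "category": { "type": "keyword" },
      "country": { "type": "keyword" },
      "age": { "type": "integer" }
    }
  }
}
POST users/_bulk?refresh
{ "index": {} }
{ "category": "sports", "country": "China", "age": 25 }
{ "index": {} }
{ "category": "sports", "country": "US", "age": 31 }
{ "index": {} }
{ "category": "music", "country": "China", "age": 22 }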
GET users/_search
{
  "size": 0,
  "aggs": {
    "categories": {
      "terms": {
        "field": "category"
      },
      "aggs": {
        "countries": {
          "terms": {
            "field": "country"
          },
          "aggs": {
            "average_age": {
              "avg": {
                "field": "age"
              }
            }
          }
        }
      }
    }
  }
}
- Result summary
- Case 2
- Filters aggregation (average goals for the defender role and the forward role):
GET sports/_search
{
  "size": 0,
  "aggs": {
    "athletes": {
      "filters": {
        "filters": {
          "defenders": {
            "term": {
              "role": "defender"
            }
          },
          "forwards": {
            "term": {
              "role": "forward"
            }
          }
        }
      },
      "aggs": {
        "avg_goals": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}
- Terms aggregation (average goals within each sport category):
GET sports/_search
{
  "size": 0,
  "aggs": {
    "sports": {
      "terms": {
        "field": "sport"
      },
      "aggs": {
        "avg_scoring": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}
- Histogram aggregation (document counts in each goals interval for the sport Basketball):
POST sports/_search
{
  "size": 0,
  "aggs": {
    "basketball_filter": {
      "filter": {
        "term": {
          "sport": "Basketball"
        }
      },
      "aggs": {
        "goals_histogram": {
          "histogram": {
            "field": "goals",
            "interval": 200
          }
        }
      }
    }
  }
}
- Date histogram aggregation (bucketing athletes by birth year):
GET sports/_search
{
  "size": 0,
  "aggs": {
    "birthdays": {
      "date_histogram": {
        "field": "birthdate",
        "interval": "year"
      }
    }
  }
}
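Adding a sub-aggregation yields the average goals within each yearly bucket: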
GET sports/_search
{
  "size": 0,
  "aggs": {
    "birthdays": {
      "date_histogram": {
        "field": "birthdate",
        "interval": "year"
      },
      "aggs": {
        "average_goals": {
          "avg": {
            "field": "goals"
          }
        }
      }
    }
  }
}
- Range aggregation (bucketing athletes into career stages by age):
GET sports/_search
{
  "size": 0,
  "aggs": {
    "age_ranges": {
      "range": {
        "field": "age",
        "ranges": [
          {
            "key": "start-of-career",
            "to": 20
          },
          {
            "key": "mid-of-career",
            "from": 20,
            "to": 30
          },
          {
            "key": "end-of-career",
            "from": 30
          }
        ]
      }
    }
  }
}
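Note that in a range aggregation the from bound is inclusive and the to bound is exclusive, so an athlete aged exactly 20 falls into the mid-of-career bucket.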
3. Observability: analyzing geospatial data with the Elastic Stack
3.1. Mapping setup
PUT flights
{
  "mappings": {
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "baro_altitude": {
        "type": "float"
      },
      "callsign": {
        "type": "keyword"
      },
      "geo_altitude": {
        "type": "float"
      },
      "icao": {
        "type": "keyword"
      },
      "last_contact": {
        "type": "long"
      },
      "location": {
        "type": "geo_point"
      },
      "on_ground": {
        "type": "boolean"
      },
      "origin_country": {
        "type": "keyword"
      },
      "position_source": {
        "type": "keyword"
      },
      "request_time": {
        "type": "long"
      },
      "spi": {
        "type": "boolean"
      },
      "squawk": {
        "type": "long"
      },
      "time_position": {
        "type": "long"
      },
      "true_track": {
        "type": "float"
      },
      "velocity": {
        "type": "float"
      },
      "vertical_rate": {
        "type": "float"
      }
    }
  }
}
3.2. flights_logstash.conf configuration
input {
  # Poll the OpenSky REST API every 15 seconds for the current aircraft states
  http_poller {
    codec => "json"
    schedule => { every => "15s" }
    urls => {
      url => "https://opensky-network.org/api/states/all"
    }
  }
}
filter {
  # Each API response holds an array of state vectors; split it into one event per aircraft
  split {
    field => "states"
    add_field => {
      "icao" => "%{[states][0]}"
      "callsign" => "%{[states][1]}"
      "origin_country" => "%{[states][2]}"
      "time_position" => "%{[states][3]}"
      "last_contact" => "%{[states][4]}"
      "location" => "%{[states][6]},%{[states][5]}"
      "baro_altitude" => "%{[states][7]}"
      "on_ground" => "%{[states][8]}"
      "velocity" => "%{[states][9]}"
      "true_track" => "%{[states][10]}"
      "vertical_rate" => "%{[states][11]}"
      "geo_altitude" => "%{[states][13]}"
      "squawk" => "%{[states][14]}"
      "spi" => "%{[states][15]}"
      "position_source" => "%{[states][16]}"
    }
  }
  mutate {
    strip => ["callsign"]
    rename => { "time" => "request_time" }
    remove_field => ["states", "@version"]
  }
  # Map the numeric position_source codes to readable names
  translate {
    field => "[position_source]"
    destination => "position_source"
    override => true
    dictionary => {
      "0" => "ADS-B"
      "1" => "ASTERIX"
      "2" => "MLAT"
    }
  }
  # A field that still starts with a literal "%{" means the corresponding slot in
  # the states array was null in the API response; drop the event or strip the field
  if [time_position] =~ /^%{*/ {
    drop { }
  }
  if [callsign] =~ /^%{*/ {
    mutate { remove_field => ["callsign"] }
  }
  if [location] =~ /^%{*/ {
    mutate { remove_field => ["location"] }
  }
  if [baro_altitude] =~ /^%{*/ {
    mutate { remove_field => ["baro_altitude"] }
  }
  if [velocity] =~ /^%{*/ {
    mutate { remove_field => ["velocity"] }
  }
  if [true_track] =~ /^%{*/ {
    mutate { remove_field => ["true_track"] }
  }
  if [vertical_rate] =~ /^%{*/ {
    mutate { remove_field => ["vertical_rate"] }
  }
  if [sensors] =~ /^%{*/ {
    mutate { remove_field => ["sensors"] }
  }
  if [geo_altitude] =~ /^%{*/ {
    mutate { remove_field => ["geo_altitude"] }
  }
  if [squawk] =~ /^%{*/ {
    mutate { remove_field => ["squawk"] }
  }
  mutate {
    convert => {
      "baro_altitude" => "float"
      "geo_altitude" => "float"
      "last_contact" => "integer"
      "on_ground" => "boolean"
      "request_time" => "integer"
      "spi" => "boolean"
      "squawk" => "integer"
      "time_position" => "integer"
      "true_track" => "float"
      "velocity" => "float"
      "vertical_rate" => "float"
    }
  }
}
output {
  stdout {
    codec => rubydebug
  }
  elasticsearch {
    manage_template => false
    index => "flights"
    # pipeline => "flights_aircraft_enrichment"
    hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
    user => "elastic"
    password => "cxYiWW4vFEE4nuubo8TZVyrY"
  }
}
3.3. Start Logstash and analyze the data in Kibana
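A typical invocation, assuming the configuration above is saved as flights_logstash.conf in the Logstash directory:
bin/logstash -f flights_logstash.conf
Once events start flowing, the flights index can be explored in Kibana, for example on a Maps visualization driven by the location geo_point field.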
4. Importing MySQL data into Elasticsearch
- Download the SalesJan2009.csv dataset (extraction code: 7kmn) and import it into the MySQL database with Navicat; the data format is as follows
- Configure sales.conf
input {
  jdbc {
    jdbc_connection_string => "jdbc:mysql://192.168.1.30:3306/db_test"
    jdbc_user => "root"
    jdbc_password => "123456"
    jdbc_validate_connection => true
    jdbc_driver_class => "com.mysql.cj.jdbc.Driver"
    # jdbc_driver_library => "/path/to/mysql-connector-java.jar" (point at the driver jar mentioned below)
    parameters => { "Product_id" => "Product1" }
    statement => "SELECT * FROM SalesJan2009 WHERE Product = :Product_id"
  }
}
filter {
  # Nest the flat longitude/latitude columns so they can map to a geo_point
  mutate {
    rename => {
      "longitude" => "[location][lon]"
      "latitude" => "[location][lat]"
    }
  }
}
output {
  stdout {
  }
  elasticsearch {
    hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
    index => "sales"
    document_type => "_doc"  # deprecated as of Elasticsearch 7
    user => "elastic"
    password => "cxYiWW4vFEE4nuubo8TZVyrY"
  }
}
- Add the JDBC driver that matches your MySQL version
- Start Logstash with bin/logstash -f config/sales.conf and analyze the data in Kibana. Note:
- Latitude/longitude values assembled in Logstash (whether from geoip or from custom fields) are only indexed as geo_point if the target field is mapped accordingly; see the sketch below
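A minimal mapping that makes the rename in sales.conf above land in a geo_point field (other columns are left to dynamic mapping; run this before the first import):
PUT sales
{
  "mappings": {
    "properties": {
      "location": { "type": "geo_point" }
    }
  }
}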
5. Handling duplicate documents with Logstash
- Use the fingerprint filter to deal with duplicate documents
- How to find and remove duplicate documents in Elasticsearch
- Configuration file
input {
  http {
    id => "data_http_input"
  }
}
filter {
  # Derive a deterministic document ID from sensor_id and date, so re-ingesting
  # the same reading overwrites the existing document instead of duplicating it
  fingerprint {
    source => [ "sensor_id", "date"]
    target => "[@metadata][fingerprint]"
    method => "SHA1"
    key => "xiaofan"
    concatenate_sources => true
    base64encode => true
  }
}
output {
  stdout {
    codec => rubydebug
  }
  elasticsearch {
    manage_template => false
    index => "fingerprint"
    hosts => [ "https://ab680dbcf3fa41d8b87e2d1e549bec77.asia-northeast1.gcp.cloud.es.io:9243" ]
    document_id => "%{[@metadata][fingerprint]}"
    user => "elastic"
    password => "cxYiWW4vFEE4nuubo8TZVyrY"
  }
}
Note: the data is collected through the http input (listening on its default port, 8080):
- curl -XPOST --header "Content-Type: application/json" "http://localhost:8080/" -d '{"sensor_id":1, "date": "2015-01-01", "reading":16.24}'
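Because the document _id is the fingerprint of sensor_id and date, posting the same payload again updates the existing document rather than creating a duplicate; a quick way to confirm is to check that the document count stays at 1:
GET fingerprint/_count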
6. Data enrichment
- Logstash: data transformation, parsing, extraction, enrichment, and core operations
- Introduction to the Logstash translate filter
- Using the elasticsearch filter to enrich data
- Using jdbc_streaming to enrich our data (a sketch follows this list)
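As a taste of the last item, a jdbc_streaming filter looks up extra columns per event and attaches them under a target field. This is only a sketch: the customers table, its columns, and the customer_id event field are hypothetical.
filter {
  jdbc_streaming {
    jdbc_connection_string => "jdbc:mysql://192.168.1.30:3306/db_test"
    jdbc_user => "root"
    jdbc_password => "123456"
    jdbc_driver_class => "com.mysql.cj.jdbc.Driver"
    # hypothetical lookup: enrich each event with the matching customer row
    statement => "SELECT name, city FROM customers WHERE id = :id"
    parameters => { "id" => "customer_id" }
    target => "customer"
  }
}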