python es查询所有数据 python读取es数据

转载

mob64ca140e76c8 2023-09-21 00:43:27

文章标签 python es查询所有数据 MySQL Elastic bc 文章分类 Python 后端开发

文章目录

基本调用

安装
基本调用
封装自用

ES基础

curl命令交互
读写
常用查询语句

查询所有
简单查询
非空
分页+排序
范围查询
多条件查询（bool）
聚合

terms

其它

appendix

基本调用

安装

conda install elasticsearch

基本调用

from elasticsearch import Elasticsearch

HOSTS = 'http://abc.com'
INDEX = 'abc'  # index name (roughly comparable to a MySQL database)

# Minimal example: connect, run a match_all query against INDEX, print the raw response.
es = Elasticsearch(HOSTS)
# Keyword arguments are used because the positional order of index/body
# is not the same across elasticsearch-py releases.
js = es.search(index=INDEX, body={'query': {'match_all': {}}})
print(js)

>>> from elasticsearch import Elasticsearch

# 默认连接localhost:9200
>>> es = Elasticsearch()

# 写
>>> es.index(index="my-index", doc_type="test-type", id=42, body={"any": "data"})
{'_id': '42', '_index': 'my-index', '_type': 'test-type', '_version': 1, 'ok': True}

# 读
>>> es.get(index="my-index", doc_type="test-type", id=42)['_source']
{'any': 'data'}

封装自用

from elasticsearch import Elasticsearch

# Default settings consumed by the ES wrapper class defined in this module.
HOSTS = 'http://abc.com'  # default value of the `hosts` argument of ES.__init__
INDEX = 'abc'  # index name (roughly comparable to a MySQL database)
SIZE = 100  # default number of hits requested per batch in ES.scroll
SCROLL = '5m'  # value passed as the `scroll` argument of the search/scroll calls
DOC_TYPE = '_doc'  # document type passed to index/delete/exists/count
# SORT_KEY = '_id'  # name of the sort key


class ES:
    """Small convenience wrapper around an ``Elasticsearch`` client.

    Every method forwards to the underlying client stored in ``self.es``.
    Client calls use keyword arguments throughout because the positional
    order of ``index``/``body``/``doc_type`` differs between client releases.
    """

    def __init__(self, hosts=HOSTS):
        """Create the underlying client for ``hosts``."""
        self.es = Elasticsearch(hosts)

    @staticmethod
    def yellow(x):
        """Print ``x`` wrapped in the ANSI escape sequence ``\\033[93m``."""
        print('\033[93m{}\033[0m'.format(x))

    def search(self, body=None, index=INDEX):
        """Run a search and return the raw response dict.

        :param body: query body; a ``match_all`` query is used when falsy.
        :param index: index to search.
        """
        return self.es.search(index=index, body=body or {'query': {'match_all': {}}})

    def scroll(self, body, index=INDEX, size=SIZE, return_ls=False):
        """Fetch all hits of a query in batches through the scroll API.

        :param body: query body.
        :param index: index to search.
        :param size: number of hits requested per batch.
        :param return_ls: when true, yield each batch as a list of hits;
            otherwise yield the hits one at a time.

        Iteration stops at the first empty batch, so it does not depend on
        the shape of ``hits.total`` (an int in some server versions, a
        ``{'value', 'relation'}`` dict in others).
        """
        page = self.es.search(index=index, body=body, scroll=SCROLL, size=size)
        hits = page['hits']['hits']
        if return_ls and not hits:
            # Batch mode always produces the first batch, even when it is empty.
            yield hits
        while hits:
            if return_ls:
                yield hits
            else:
                yield from hits
            # Always continue from the scroll id of the latest response.
            page = self.es.scroll(scroll_id=page['_scroll_id'], scroll=SCROLL)
            hits = page['hits']['hits']

    def index(self, body, index, i):
        """Write document ``body`` into ``index`` under id ``i``."""
        self.es.index(index=index, body=body, doc_type=DOC_TYPE, id=i)

    def delete(self, i, index):
        """Delete the document with id ``i`` from ``index``."""
        self.es.delete(index=index, id=i, doc_type=DOC_TYPE)

    def delete_by_query(self, body, index):
        """Delete every document of ``index`` matched by query ``body``."""
        self.es.delete_by_query(index=index, body=body)

    def exists(self, i, index):
        """Return whether a document with id ``i`` exists in ``index``."""
        return self.es.exists(index=index, id=i, doc_type=DOC_TYPE)

    def get_source(self, i, index, source=None):
        """Return the ``_source`` of the document with id ``i``.

        :param source: optional field filter forwarded as ``_source``;
            omitted from the request when falsy.

        The client raises when the document cannot be found.
        """
        if source:
            return self.es.get_source(index=index, id=i, _source=source)
        return self.es.get_source(index=index, id=i)

    def count(self, body, index, doc_type=DOC_TYPE):
        """Return the client's count response for query ``body`` on ``index``.

        :param body: query body, or ``None`` to count without a query.
        """
        return self.es.count(index=index, doc_type=doc_type, body=body)

    def proportion_not_null(self, index, field=None):
        """Print how many documents of ``index`` have a value for ``field``.

        Prints ``field``, the total document count, the count of documents
        where ``field`` exists, and their ratio (``0.0`` for an empty index).
        """
        total = self.count(None, index)['count']
        exists_query = {'query': {'bool': {'must': {'exists': {'field': field}}}}}
        present = self.count(exists_query, index)['count']
        ratio = present / total if total else 0.0
        print(field, total, present, ratio)

    def aggs_terms(self, index, field, size=15):
        """Return the buckets of a ``terms`` aggregation on ``field``.

        :param size: number of buckets requested (raise it when the
            aggregation result looks truncated).
        """
        return self.search({
            'aggs': {
                'CUSTOM NAME': {
                    'terms': {
                        'field': field,
                        'size': size,
                    }
                }
            }
        }, index)['aggregations']['CUSTOM NAME']['buckets']


# Shared module-level wrapper instance and a shortcut to its scroll method,
# so other modules can import `es` / `scroll` directly.
es = ES()
scroll = es.scroll


if __name__ == '__main__':
    # Demo: print the raw response of a range query limited to 5 hits.
    print(es.search({
        # '_source': ["dataType"],
        'size': 5,  # NOTE(review): is 10000 the upper limit? - unconfirmed
        'query': {'range': {'id': {'gt': '616000000000000000'}}}
    }))

print

{
	'took': 614,
	'timed_out': False,
	'_shards': {
		'total': 1,
		'successful': 1,
		'skipped': 0,
		'failed': 0
	},
	'hits': {
		'total': {
			'value': 10000,
			'relation': 'gte'
		},
		'max_score': 1.0,
		'hits': [{
			'_index': 'abc',  # 类似MySQL的库
			'_type': '_doc',  # 类似MySQL库中的表
			'_id': '9948942229923430',  # 类似MySQL表中的主键
			'_score': 1.0,
			'_source': {
				'title': '森林公园林地资源',
			}
		}, {
			'_index': 'abc',
			'_type': '_doc',
			'_id': '9948937613253017',
			'_score': 1.0,
			'_source': {
				'title': '小型微型企业创业创新示范基地',
			}
		}]
	}
}

ES基础

curl命令交互

cURL是一个利用URL语法在命令行下工作的文件传输工具（CommandLine Uniform Resource Locator）

curl -X<VERB> '<PROTOCOL>://<HOST>:<PORT>/<PATH>?<QUERY_STRING>' -d '<BODY>'

被<>标记的部件	解析
VERB	合适的HTTP方法：GET、 POST、 PUT、 HEAD、DELETE…
PROTOCOL	协议，如：http、https
HOST	Elasticsearch 集群中任意节点的主机名，或者用`localhost`代表本地机节点
PORT	端口号，默认`9200`
PATH	API 的终端路径（如：`_count`将返回集群中文档数量）可能含多个组件，如：`_cluster/stats`和`_nodes/stats/jvm`
QUERY_STRING	任意可选的查询字符串参数（常用`?pretty`使输出格式化成`JSON`）
BODY	一个`JSON`格式的请求体

e.g.

curl -XGET 'http://localhost:9200/_count?pretty' -H 'Content-Type: application/json' -d '
{"query": {"match_all": {}}}
'

读写

写

PUT /megacorp/employee/1
{
    "full_name" : "小基基",
    "advantage": [ "数学", "物理" ]
}

节点	示例	解释
_index	megacorp：索引名称	类似MySQL的库
_type	employee：类型名称	类似MySQL的库中的表
_id	1：特定雇员的ID	类似MySQL的表中的主键

读

GET /megacorp/employee/1

常用查询语句

查询所有

{'query': {'match_all': {}}}

简单查询

{
    '_source': ['id'],
    'query': {'match': {'full_name': '老婆'}},
}

非空

{'query': {'bool': {'must': {'exists': {'field': field}}}}}  # 存在
{'query': {'bool': {'must_not': {'exists': {'field': field}}}}}  # 非存在

分页+排序

{
    'from': 20,
    'size': 10,
    'query': {'match': {'full_name': '老婆'}},
    'sort': {'_id': {'order': 'asc'}}
}

范围查询

{
    'query': {
        'range': {
            'id': {
                'gte': '615000000000000000',  # 大于等于
                'lt': '616000000000000000',  # 小于
            },
        }
    },
}

多条件查询（bool）

must：返回的文档必须满足must子句的条件，并且参与计分；filter：返回的文档必须满足filter子句的条件，但不像must那样参与计分；should：返回的文档可能满足should子句的条件。

在一个Bool查询中，如果没有must或filter，有一个或者多个should子句，那么只要满足一个就可以返回。

minimum_should_match参数定义了至少满足多少个子句

must_not 返回的文档必须不满足must_not定义的条件

{
    'query': {
        'bool': {
            'must': [
                {
                    'match': {
                        'valid': 'Y'
                    }
                }, {
                    'range': {
                        'id': {
                            'gte': '615000000000000000',
                            'lt': '616000000000000000',
                        }
                    }
                }
            ]
        }
    },
}

聚合

terms

{
    'aggs': {
        'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa': {
            'terms': {
                'field': 'year',  # 按年份聚合统计
                'size': 20,  # 解决aggs显示不全
            }
        }
    }
}

其它

from corpora.elastic_search import es

# match + phrase: documents with valid == 'Y' whose title contains the phrase
match_phrase = {
    '_source': ['title'],
    'size': 20,
    'query': {
        'bool': {
            'must': [
                {
                    'match': {
                        'valid': 'Y'
                    }
                }, {
                    'match_phrase': {
                        'title': '复工'
                    }
                },
            ]
        }
    },
    'sort': {'_id': {'order': 'asc'}}
}


# match + regexp: documents with valid == 'Y' whose title matches the pattern
match_regexp = {
    '_source': ['title'],
    # 'size': 20,
    'query': {
        'bool': {
            'must': [
                {
                    'match': {
                        'valid': 'Y'
                    }
                }, {
                    'regexp': {
                        'title': '申报|申请|专项资金|扶持',
                    }
                },
            ]
        }
    },
    'sort': {'_id': {'order': 'asc'}}
}

# Query to execute; assign match_phrase instead to run the phrase search.
_body = match_regexp
# Stream every matching document through the imported wrapper's scroll method.
for hit in es.scroll(_body):
    print(hit['_source'])