概览
Elasticsearch 是一个基于 Lucene 的分布式搜索与分析引擎,提供近实时的全文搜索、结构化搜索和数据分析能力。通过 RESTful API 和 JSON 交互,是 ELK Stack(Elasticsearch + Logstash + Kibana)的核心。广泛用于日志分析、站内搜索、应用监控、推荐系统等场景。支持水平扩展、自动分片和副本。
| 要求 | 说明 |
|---|---|
| 操作系统 | Linux、macOS、Windows(WSL2 或 Docker) |
| Java 运行时 | 内置 OpenJDK(ES 7.x+ 自包含),无需单独安装 |
| 内存 | 最低 2 GB,推荐 4 GB+ |
| 磁盘 | SSD 推荐,至少 5 GB |
| 端口 | 9200(HTTP REST)、9300(节点间通信) |
# Start a single-node cluster (security is switched off: development use only)
docker run --detach --name elasticsearch \
  --publish 9200:9200 --publish 9300:9300 \
  --env "discovery.type=single-node" \
  --env "xpack.security.enabled=false" \
  --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
  --volume es_data:/usr/share/elasticsearch/data \
  docker.elastic.co/elasticsearch/elasticsearch:8.15.0

# Verify that the node answers on the REST port
curl http://localhost:9200/
# Output: {"name":"...","version":{"number":"8.15.0"},"tagline":"You Know, for Search"}
# Import the Elastic signing key into a dedicated keyring.
# (apt-key is deprecated and no longer available on current Debian/Ubuntu releases.)
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \
  | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg

# Add the repository, pinned to that keyring via signed-by
echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" \
  | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
sudo apt update
sudo apt install -y elasticsearch

# Configuration (optional: turn off security to simplify local development)
sudo sed -i 's/xpack.security.enabled: true/xpack.security.enabled: false/' /etc/elasticsearch/elasticsearch.yml
sudo sed -i 's/xpack.security.enrollment.enabled: true/xpack.security.enrollment.enabled: false/' /etc/elasticsearch/elasticsearch.yml

# Start the service now and enable it on every boot
sudo systemctl start elasticsearch
sudo systemctl enable elasticsearch
# Install through Homebrew using Elastic's own tap.
# NOTE(review): the elastic/tap formula may lag behind the 8.x releases used elsewhere in this guide — confirm before relying on it.
brew tap elastic/tap
brew install elastic/tap/elasticsearch-full
elasticsearch # run in the foreground
# Start ES + Kibana together with Docker Compose.
# The quoted 'EOF' delimiter keeps the shell from expanding anything inside the YAML.
# YAML nesting is significant: each service's keys must be indented under the service name.
cat > docker-compose.yml << 'EOF'
version: '3'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      - "9200:9200"
  kibana:
    image: docker.elastic.co/kibana/kibana:8.15.0
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - "5601:5601"
    depends_on:
      - elasticsearch
EOF
docker-compose up -d
# Kibana is then reachable at http://localhost:5601
Linux 下必须提高 mmap 计数:
# Raise the mmap count limit on the running kernel
sudo sysctl -w vm.max_map_count=262144
# Persist the setting by appending it to /etc/sysctl.conf
echo "vm.max_map_count=262144" | sudo tee -a /etc/sysctl.conf
# Open the limits file in an editor
sudo vim /etc/security/limits.conf
# Add the following two lines to that file (they are file content, not shell commands):
elasticsearch - nofile 65535
elasticsearch - nproc 4096
官方 Docker 镜像中的 ES 绑定 0.0.0.0(通过安装包安装时默认只绑定本机回环地址)。如果无法访问 9200 端口,请检查是否启用了安全认证(8.x 默认启用,需要 HTTPS 和密码)。开发环境建议:
-e "xpack.security.enabled=false"
限制 JVM 堆大小(ES_JAVA_OPTS)不超过物理内存的 50%,且最大 32 GB。
创建索引、插入文档、执行全文搜索和聚合分析,理解 ES 的 RESTful 搜索范式。
# 1. Create the index (with an explicit mapping)
curl -X PUT "http://localhost:9200/library" -H "Content-Type: application/json" -d '{
  "mappings": {
    "properties": {
      "title": {"type": "text", "analyzer": "standard"},
      "author": {"type": "keyword"},
      "year": {"type": "integer"},
      "pages": {"type": "integer"},
      "tags": {"type": "keyword"},
      "description": {"type": "text"}
    }
  }
}'

# 2. Bulk-insert documents (_bulk API).
# The payload is newline-delimited JSON: one action line, then one document line,
# each on a single line, with a trailing newline after the last document.
curl -X POST "http://localhost:9200/library/_bulk" -H "Content-Type: application/json" -d '
{"index": {"_id": "1"}}
{"title": "深入理解计算机系统", "author": "Randal E. Bryant", "year": 2015, "pages": 1080, "tags": ["cs", "systems"], "description": "从程序员视角理解计算机系统"}
{"index": {"_id": "2"}}
{"title": "算法导论", "author": "Thomas H. Cormen", "year": 2009, "pages": 1312, "tags": ["algorithms", "textbook"], "description": "全面介绍算法的经典教材"}
{"index": {"_id": "3"}}
{"title": "计算机网络:自顶向下方法", "author": "James F. Kurose", "year": 2017, "pages": 864, "tags": ["networking", "textbook"], "description": "计算机网络入门经典"}
{"index": {"_id": "4"}}
{"title": "操作系统概念", "author": "Abraham Silberschatz", "year": 2018, "pages": 976, "tags": ["os", "textbook"], "description": "操作系统原理经典教材"}
'

# 3. Full-text search (match query)
curl -X GET "http://localhost:9200/library/_search" -H "Content-Type: application/json" -d '{
  "query": {
    "match": {
      "title": "计算机 系统"
    }
  }
}'

# 4. Multi-field search (a hit in title counts double).
# The query text is Chinese because every indexed title and description is Chinese;
# an English query such as "algorithm textbook" matches nothing in these two fields.
curl -X GET "http://localhost:9200/library/_search" -H "Content-Type: application/json" -d '{
  "query": {
    "multi_match": {
      "query": "算法 教材",
      "fields": ["title^2", "description"]
    }
  }
}'

# 5. Filter + search (the range clause filters without affecting the score)
curl -X GET "http://localhost:9200/library/_search" -H "Content-Type: application/json" -d '{
  "query": {
    "bool": {
      "must": {"match": {"description": "经典"}},
      "filter": {"range": {"year": {"gte": 2015}}}
    }
  }
}'

# 6. Aggregation (document count per tag; "size": 0 suppresses the hit list)
curl -X GET "http://localhost:9200/library/_search" -H "Content-Type: application/json" -d '{
  "size": 0,
  "aggs": {
    "by_tag": {
      "terms": {"field": "tags"}
    }
  }
}'

# 7. Highlighting of the matched fragments
curl -X GET "http://localhost:9200/library/_search" -H "Content-Type: application/json" -d '{
  "query": {"match": {"description": "经典"}},
  "highlight": {
    "fields": {"description": {}}
  }
}'
# pip install elasticsearch
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])

# Create the index only when it is missing. Checking first (instead of
# ignoring every HTTP 400) lets a genuine mapping error surface.
if not es.indices.exists(index='library'):
    es.indices.create(index='library', body={
        'mappings': {
            'properties': {
                'title': {'type': 'text'},
                'author': {'type': 'keyword'},
                'year': {'type': 'integer'},
                'tags': {'type': 'keyword'}
            }
        }
    })

# Index one document
doc = {
    'title': '深入理解计算机系统',
    'author': 'Randal E. Bryant',
    'year': 2015,
    'tags': ['cs', 'systems']
}
es.index(index='library', id=1, body=doc)

# Search is near real-time: a freshly indexed document only becomes visible
# after a refresh, so force one before querying.
es.indices.refresh(index='library')

# Full-text search with highlighting
result = es.search(index='library', body={
    'query': {'match': {'title': '计算机'}},
    'highlight': {'fields': {'title': {}}}
})
for hit in result['hits']['hits']:
    print(f"得分 {hit['_score']:.2f}: {hit['_source']['title']}")
    if 'highlight' in hit:
        print(f" 高亮: {hit['highlight']['title']}")

# Aggregation: document count per tag ('size': 0 suppresses the hit list)
agg_result = es.search(index='library', body={
    'size': 0,
    'aggs': {'popular_tags': {'terms': {'field': 'tags'}}}
})
for bucket in agg_result['aggregations']['popular_tags']['buckets']:
    print(f"{bucket['key']}: {bucket['doc_count']} 本")
// 全文搜索结果
{
"hits": {
"total": {"value": 2},
"hits": [
{"_source": {"title": "深入理解计算机系统"}, "_score": 1.2},
{"_source": {"title": "计算机网络:自顶向下方法"}, "_score": 0.8}
]
}
}
// 聚合结果
{
"aggregations": {
"by_tag": {
"buckets": [
{"key": "textbook", "doc_count": 3},
{"key": "algorithms", "doc_count": 1}
]
}
}
}
- text 类型字段会被分词,适合全文搜索;keyword 类型用于精确匹配和聚合
- _bulk API 高效批量导入
- bool query 组合 must/should/must_not/filter
- aggregations 实现分组统计,类似 SQL 的 GROUP BY
- 搜索结果默认按 _score(相关度评分)降序排列

使用 Python elasticsearch 客户端:创建索引、写入文档、全文搜索、聚合查询。
# Install the Python client
pip install elasticsearch
# Make sure an ES node is running (single node, security disabled)
docker run -d --name es -p 9200:9200 -e "discovery.type=single-node" \
-e "xpack.security.enabled=false" elasticsearch:8.12.0
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('http://localhost:9200')

# Index definition.
# The text fields use the built-in "standard" analyzer: "ik_max_word" only
# exists once the third-party analysis-ik plugin is installed, and a stock
# Elasticsearch image does not ship it. Switch the analyzer back to
# "ik_max_word" after installing that plugin if Chinese word segmentation
# is needed.
mapping = {
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "standard"},
            "content": {"type": "text", "analyzer": "standard"},
            "price": {"type": "float"},
            "category": {"type": "keyword"},
            "created_at": {"type": "date"}
        }
    }
}

# Create the index only when it is missing. Checking first (instead of
# ignoring every HTTP 400) lets a rejected mapping raise immediately rather
# than leaving the index to be auto-created with a dynamic mapping.
if not es.indices.exists(index='products'):
    es.indices.create(index='products', body=mapping)

# Bulk write: "_index" and "_id" are action metadata, the remaining keys
# form the document source.
docs = [
    {"_index": "products", "_id": 1, "title": "STM32F103开发板", "content": "ARM Cortex-M3 72MHz", "price": 49.9, "category": "开发板"},
    {"_index": "products", "_id": 2, "title": "树莓派4B", "content": "四核 Cortex-A72 4GB RAM", "price": 299, "category": "开发板"},
    {"_index": "products", "_id": 3, "title": "USB转TTL模块", "content": "CH340G 3.3V/5V", "price": 9.9, "category": "连接器"},
]
bulk(es, docs)

# Search is near real-time: make the new documents visible before querying.
es.indices.refresh(index='products')

# Full-text search
query = {
    "query": {
        "multi_match": {
            "query": "Cortex ARM",
            "fields": ["title^2", "content"]  # a hit in title counts double
        }
    },
    "highlight": {
        "fields": {"content": {}}
    }
}
result = es.search(index='products', body=query)
for hit in result['hits']['hits']:
    print(f"⭐ {hit['_source']['title']} (评分: {hit['_score']:.2f})")

# Aggregation: per category, the average price and a price-range breakdown
agg_query = {
    "size": 0,
    "aggs": {
        "by_category": {
            "terms": {"field": "category"},
            "aggs": {
                "avg_price": {"avg": {"field": "price"}},
                "price_range": {
                    "range": {"field": "price", "ranges": [
                        {"to": 50}, {"from": 50, "to": 200}, {"from": 200}
                    ]}
                }
            }
        }
    }
}
result = es.search(index='products', body=agg_query)
for bucket in result['aggregations']['by_category']['buckets']:
    print(f"{bucket['key']}: {bucket['doc_count']} 个商品, 均价 ¥{bucket['avg_price']['value']:.2f}")
搜索 "Cortex ARM" 会按相关度排序返回开发板。聚合查询按分类汇总商品数量和均价。
Elasticsearch 的核心是倒排索引——不同于顺序扫描,它为每个词建立"词→文档列表"映射:
正向索引: doc1 → ["深入", "理解", "计算机", "系统"]
倒排索引: "计算机" → [doc1, doc3, doc7]
"系统" → [doc1, doc4, doc9]
查询"计算机 系统"时,先在词典中定位这两个词(单次查找近似 O(1)),再对各自的文档列表取交集(AND)或并集(OR),无需逐篇扫描文档。
| ES 概念 | 类比 SQL | 说明 |
|---|---|---|
| Index | Database | 文档集合 |
| Type(7.x 废弃) | Table | 同一索引下不再分类型 |
| Document | Row | JSON 格式记录 |
| Field | Column | 文档属性 |
| Mapping | Schema | 字段类型定义 |
| Shard | Partition | 数据分片(水平切分) |
| Replica | Replica | 副本(高可用) |
电商平台需要支持:关键词搜索、分类过滤、价格区间、排序、自动补全。
// Console-style request (e.g. Kibana Dev Tools): creates the "products" index with explicit settings and mappings.
// NOTE(review): "ik_smart" is not a built-in tokenizer — presumably the analysis-ik plugin must be installed first; confirm.
// NOTE(review): "ik_smart_analyzer" is defined here but no field references it ("name" uses "standard") — confirm intent.
PUT /products
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"ik_smart_analyzer": {
"type": "custom",
"tokenizer": "ik_smart"
}
}
}
},
"mappings": {
"properties": {
"name": {
"type": "text",
"analyzer": "standard",
"fields": {
"keyword": {"type": "keyword"}
}
},
"category": {"type": "keyword"},
"price": {"type": "double"},
"brand": {"type": "keyword"},
"rating": {"type": "float"},
"stock": {"type": "integer"},
"description": {"type": "text"},
"created_at": {"type": "date"}
}
}
}
from elasticsearch import Elasticsearch
from faker import Faker
import random
es = Elasticsearch(['http://localhost:9200'])
fake = Faker()

categories = ['电子产品', '图书', '服装', '食品', '家居']
brands = ['华为', '苹果', '小米', '索尼', '三星', '美的']

# Number of random products to generate; document ids run from 1 to this value.
PRODUCT_COUNT = 100

for i in range(1, PRODUCT_COUNT + 1):
    product = {
        'name': f"{fake.word().capitalize()} {random.choice(['Pro', 'Max', 'Lite', 'Air'])}",
        'category': random.choice(categories),
        'price': round(random.uniform(9.9, 9999), 2),
        'brand': random.choice(brands),
        'rating': round(random.uniform(1, 5), 1),
        'stock': random.randint(0, 500),
        'description': fake.sentence(),
        'created_at': fake.date_this_year().isoformat()
    }
    es.index(index='products', id=i, body=product)

# Refresh so that the count below (and later searches) see every new document.
es.indices.refresh(index='products')
print(f"已索引 {PRODUCT_COUNT} 个商品, 总数: {es.count(index='products')['count']}")
def _build_search_body(keyword, category=None, min_price=None, max_price=None, sort_by=None):
    """Assemble the Elasticsearch request body used by ``search_products``."""
    if keyword:
        must = [{
            'multi_match': {
                'query': keyword,
                'fields': ['name^3', 'description', 'brand^2']
            }
        }]
    else:
        # No keyword: return everything that passes the filters.
        must = [{'match_all': {}}]

    filters = []
    if category:
        filters.append({'term': {'category': category}})

    # Compare against None rather than truthiness so that a bound of 0 is
    # kept, and never emit a range clause that has no bounds.
    price_range = {}
    if min_price is not None:
        price_range['gte'] = min_price
    if max_price is not None:
        price_range['lte'] = max_price
    if price_range:
        filters.append({'range': {'price': price_range}})

    body = {
        'query': {
            'bool': {
                'must': must,
                'filter': filters
            }
        },
        'highlight': {
            'fields': {'name': {}, 'description': {}}
        }
    }

    # Unknown or missing sort keys leave the default relevance ordering.
    sort_clauses = {
        'price_asc': [{'price': 'asc'}],
        'price_desc': [{'price': 'desc'}],
        'rating': [{'rating': 'desc'}],
    }
    sort_clause = sort_clauses.get(sort_by)
    if sort_clause is not None:
        body['sort'] = sort_clause
    return body


def search_products(keyword, category=None, min_price=None, max_price=None, sort_by=None, client=None):
    """Search the ``products`` index the way an e-commerce front end would.

    Args:
        keyword: Full-text query matched against ``name`` (boost 3),
            ``brand`` (boost 2) and ``description``. A falsy value matches
            all documents.
        category: Optional exact category to filter on.
        min_price: Optional inclusive lower price bound; ``0`` is honoured.
        max_price: Optional inclusive upper price bound; ``0`` is honoured.
        sort_by: ``'price_asc'``, ``'price_desc'`` or ``'rating'``; any other
            value keeps the default ordering.
        client: Optional object exposing ``search(index=..., body=...)``.
            Defaults to the module-level ``es`` client.

    Returns:
        The ``hits`` section of the search response.
    """
    body = _build_search_body(keyword, category, min_price, max_price, sort_by)
    if client is None:
        client = es
    result = client.search(index='products', body=body)
    return result['hits']
# Search example: keyword + category filter + price range, best rated first
hits = search_products(keyword='手机', category='电子产品', min_price=100, max_price=5000, sort_by='rating')
print(f"找到 {hits['total']['value']} 条结果")
for hit in hits['hits']:
    print(f" {hit['_source']['name']} - ¥{hit['_source']['price']} - ⭐{hit['_source']['rating']}")

# Aggregations in one request: document count per category, overall average
# price, and a price histogram with buckets 500 wide ('size': 0 suppresses
# the hit list).
agg_result = es.search(index='products', body={
    'size': 0,
    'aggs': {
        'category_stats': {'terms': {'field': 'category'}},
        'avg_price': {'avg': {'field': 'price'}},
        'price_histogram': {
            'histogram': {'field': 'price', 'interval': 500}
        }
    }
})
print(agg_result['aggregations']['avg_price'])