文档
NLTK 经典 NLP Pipeline
目标
展示 NLTK 的标准 NLP 处理流程:分词 → 词性标注 → 命名实体识别 → 词干提取 → 词频统计。
完整代码
import nltk
import ssl
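# First run only: uncomment the block below to download the NLTK data.
# The try/except/else is a workaround for SSL certificate errors during the
# download; note that it disables HTTPS certificate verification.
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
# NLTK >= 3.9 loads "averaged_perceptron_tagger_eng" and "maxent_ne_chunker_tab"
# instead of the older pickled models, so both generations are listed.
# nltk.download(["punkt", "punkt_tab",
#                "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
#                "maxent_ne_chunker", "maxent_ne_chunker_tab",
#                "words", "stopwords", "wordnet"])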
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
# Sample passage that every stage of the pipeline below operates on.
# It starts and ends with a newline, exactly like a triple-quoted block would.
text = (
    "\n"
    "Elon Musk announced on Monday that Tesla's new factory in Shanghai\n"
    "will produce over 500,000 electric vehicles annually.\n"
    "The groundbreaking ceremony was attended by local officials and Tesla executives.\n"
    "Tim Cook, the CEO of Apple, also visited Beijing last week.\n"
)
# ─── 1. Tokenization ───
# Split the sample passage into tokens; only the first ten are echoed as a preview.
tokens = word_tokenize(text)
head = tokens[:10]
print(f"分词: {head}...")
# ─── 2. Stop-word removal + alphabetic tokens only ───
# A set gives O(1) membership checks against the English stop-word list.
stop_words = set(stopwords.words("english"))
clean_tokens = []
for token in tokens:
    lowered = token.lower()
    # Keep purely alphabetic tokens (drops punctuation and numbers such as
    # "500,000") whose lowercase form is not a stop word.
    if token.isalpha() and lowered not in stop_words:
        clean_tokens.append(lowered)
print(f"\n清洗后: {clean_tokens}")
# ─── 3. Part-of-speech tagging ───
# Tag the original (un-lowercased, unfiltered) tokens; the result is reused
# by the named-entity step below.
pos_tags = pos_tag(tokens)
print("\n词性标注:")
for word, tag in pos_tags:
    # Only alphabetic tokens are listed, so punctuation and numbers are skipped.
    if word.isalpha():
        print(f" {word:<20} → {tag}")
# ─── 4. Named-entity recognition ───
# Chunk the tagged tokens; the result is iterated as a sequence of children.
ner_tree = ne_chunk(pos_tags)
print("\n命名实体:")
for subtree in ner_tree:
    # Children exposing `label` are treated as entity chunks; everything else
    # (the plain tagged tokens) is skipped.
    if hasattr(subtree, "label"):
        # Each leaf is indexed at [0] to get the word; join them into one phrase.
        entity = " ".join(leaf[0] for leaf in subtree.leaves())
        print(f" {subtree.label():<10} | {entity}")
# ─── 5. Stemming vs. lemmatization ───
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print("\n词干 / 词形还原对比:")
for word in ["running", "vehicles", "factories", "better", "attended"]:
    # lemmatize() is called without a part-of-speech argument, which is why
    # verb forms such as "running" come back unchanged in the expected output.
    print(f" {word:<12} → stem: {stemmer.stem(word):<12} | lemma: {lemmatizer.lemmatize(word)}")
# ─── 6. Word-frequency statistics ───
# Count the cleaned (lowercased, stop-word-free) tokens from step 2.
fdist = FreqDist(clean_tokens)
print("\nTop 10 高频词:")
for word, freq in fdist.most_common(10):
    print(f" {word:<15} {freq}")
# ─── 7. WordNet semantics ───
from nltk.corpus import wordnet

# Print the name, definition and example sentences of every noun synset
# found for "vehicle".
for synset in wordnet.synsets("vehicle", pos=wordnet.NOUN):
    print(f"\nSynset: {synset.name()}")
    print(f" 定义: {synset.definition()}")
    print(f" 例句: {synset.examples()}")
运行步骤
pip install nltk
python nltk_pipeline.py
预期输出
分词: ['Elon', 'Musk', 'announced', 'on', 'Monday', 'that', 'Tesla', "'s", 'new', 'factory']...
词性标注:
Elon → NNP (专有名词)
Musk → NNP
announced → VBD (动词过去式)
Tesla → NNP
...
命名实体:
PERSON | Elon Musk
GPE | Shanghai
PERSON | Tim Cook
ORGANIZATION | Apple
GPE | Beijing
词干 / 词形还原对比:
running → stem: run | lemma: running
vehicles → stem: vehicl | lemma: vehicle
factories → stem: factori | lemma: factory
Top 10 高频词:
tesla 2
elon 1
musk 1
...