新環境を構築するとき忘れがちなので、覚え書きとして残しておく。
1.インストール
sudo apt update
sudo apt upgrade
sudo apt install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file
sudo apt install python3-pip
pip3 install mecab-python3 unidic-lite neologdn
git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a
Usage:
$ mecab -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd ...
2.使用方法
import re
import neologdn
import subprocess
import MeCab
# Feature values that mark a token as a content word
# (noun, verb, adjective, adjectival noun, adverb).
CONTENT_WORD_POS = (
    '名詞',
    '動詞',
    '形容詞',
    '形容動詞',
    '副詞',
)
# Feature values that exclude a token as a stop word
# (pronoun, auxiliary verb, non-independent, numeral, person name).
STOP_WORD_POS = (
    '代名詞',
    '助動詞',
    '非自立',
    '数',
    '人名',
)
# Normalization
def normalize(text):
    """Normalize text before morphological analysis.

    The text is lower-cased, normalized with ``neologdn.normalize``, and
    then every run of digits is collapsed to a single ``'0'``.

    The digit substitution runs *after* neologdn normalization on purpose:
    the ``[0-9]`` pattern only matches ASCII digits, and neologdn converts
    full-width digits to their ASCII form, so doing it in this order also
    collapses full-width numbers.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    normalized = neologdn.normalize(text.lower())
    return re.sub(r'[0-9]+', '0', normalized)
# Morphological analysis
def _neologd_dicdir():
    """Return the mecab-ipadic-neologd directory under MeCab's dicdir."""
    try:
        # Argument list instead of a shell pipeline: no shell is spawned.
        out = subprocess.check_output(['mecab-config', '--dicdir'])
    except (OSError, subprocess.CalledProcessError) as err:
        raise RuntimeError('failed to run "mecab-config --dicdir"') from err
    # Strip the trailing newline printed by mecab-config before appending.
    return out.decode('utf-8').strip() + '/mecab-ipadic-neologd'


def _is_content_word(features):
    """Return True if features name a content-word POS and no stop-word tag."""
    if not any(pos in features for pos in CONTENT_WORD_POS):
        return False
    return not any(pos in features for pos in STOP_WORD_POS)


def analysis(text):
    """Extract content words from text as a space-separated string.

    The text is passed through ``normalize`` and tokenized with MeCab using
    the mecab-ipadic-neologd dictionary. A token is kept when its first six
    feature fields contain one of ``CONTENT_WORD_POS`` and none of
    ``STOP_WORD_POS``. A kept token is glued to the previously kept word
    (no separating space) when it is a suffix ('接尾'), or when it is a noun
    ('名詞') and the preceding node carried '名詞接続'.

    Args:
        text: The input string.

    Returns:
        The kept surface forms joined by single spaces; an empty string when
        nothing is kept.

    Raises:
        RuntimeError: If ``mecab-config --dicdir`` cannot be executed.
    """
    tagger = MeCab.Tagger('-d {0}'.format(_neologd_dicdir()))
    # Kept from the original; presumably the usual workaround for bindings
    # that return a broken node.surface unless parse() was called once.
    tagger.parse('')
    node = tagger.parseToNode(normalize(text))
    words = []
    pre_features = []
    while node:
        features = node.feature.split(',')[:6]
        if _is_content_word(features):
            joins_previous = (
                ('名詞接続' in pre_features and '名詞' in features)
                or '接尾' in features
            )
            # With no earlier word to attach to, start a new one. The old
            # string-based version dropped the first character in this case.
            if joins_previous and words:
                words[-1] += node.surface
            else:
                words.append(node.surface)
        # NOTE(review): the memo lost its indentation; this assumes
        # pre_features tracks every node, not only the kept ones - confirm.
        pre_features = features
        node = node.next
    return ' '.join(words)