TextMining_lecture01(1)

ML 공부

TextMining_lecture01(1)

민지기il 2024. 11. 6. 17:02

# NLTK

: 자연어 처리를 위한 파이썬 패키지

- 토큰화하기

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

para = "Hello everyone. It's good to see you. Let's start our text mining class!"
sentences = sent_tokenize(para)  # split the paragraph into sentences
words = word_tokenize(para)  # split the paragraph into word-level tokens
print(sentences)
print(words)

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']

# RE(Regular Expression)

import re 
# Regex patterns are written as raw strings so that escapes such as \w are
# passed to the regex engine instead of being parsed as string escapes.
# Each result is printed so the snippet also shows its output when run as a script.
print(re.findall(r"[abc]", "How are you, boy?"))  # any single character among a, b, c
print(re.findall(r"[\w]", "3a 7b_'.&5c9d"))  # \w: letters, digits and the underscore
print(re.findall(r"[_]+", "a_b, c__d, e__f"))  # +: one or more repetitions
print(re.findall(r"[o]{2,4}", "oh, hoow are yoooooou, boooooooooy?"))  # {2,4}: runs of 2 to 4 'o'

['a', 'b']

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

['_', '__', '__']
['oo', 'oooo', 'oo', 'oooo', 'oooo']

# RegexpTokenizer

: 정규표현식(re) 패턴에 일치하는 부분을 토큰으로 추출

from nltk.tokenize import RegexpTokenizer  # needed: the name is not imported elsewhere in these notes

text1 = "Sorry, I can't go there."
# Raw string keeps \w intact for the regex engine; {3,} keeps only tokens of
# three or more characters (word characters or apostrophes).
tokenizer = RegexpTokenizer(r"[\w']{3,}")
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']

# Stopwords 제거

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer  # needed: the name is not imported elsewhere in these notes

# Words to remove, stored in a set for membership tests.
english_stops = set(stopwords.words('english'))

text1 = "Sorry, I couldn't go to movie yesterday."

# Raw string keeps \w intact for the regex engine.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())

# Keep only the tokens that are not in the stop-word set.
result = [word for word in tokens if word not in english_stops]
print(result)

['sorry', 'go', 'movie', 'yesterday']

stopwords.words('english')는 "the", "is", "in", "and"와 같이 영어에서 자주 사용되지만 중요한 의미를 갖지 않는 불용어(stopwords) 목록을 반환함

my_stopword=['i','go','to'] 처럼 직접 지정 가능

# Stemming

: 어간 추출. 접미사, 어미 제거 ex) running -> run, happily -> happi
Stemming algorithms for English: Porter Stemmer & Lancaster Stemmer

- Porter Stemmer

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
# Stem each sample word and print the stems on one line, separated by spaces.
print(*(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')))

cook cookeri cookbook

- Lancaster

from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
# Collect the stems first, then print them space-separated on a single line.
stems = [stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')]
print(*stems)

cook cookery cookbook

# Lemmatization

: 표제어 추출. 단어를 사전형으로 ex) running -> run, happily -> happy, better -> good
NLTK에서는 WordNet 어휘 사전을 기반으로 하는 WordNetLemmatizer를 사용

# Part of speech tagging

: 품사 태깅 (명사, 대명사, 동사, 형용사 ...)
nltk.pos_tag(): 토큰화된 결과에 대한 품사를 태그하고 출력을 튜플 목록으로 반환한다.
여기서 각 튜플은 단어와 해당 품사로 구성된다.

import nltk
from nltk.tokenize import word_tokenize

tokens = word_tokenize(
    "Hello everyone. It's good to see you. Let's start our text mining class!"
)
# pos_tag pairs every token with its part-of-speech tag.
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]

my_tag_set = ['NN', 'VB', 'JJ']
# Tag the tokens, then keep only the words whose tag is in my_tag_set.
tagged_pairs = nltk.pos_tag(tokens)
my_words = [
    word
    for word, tag in tagged_pairs
    if tag in my_tag_set
]
print(my_words)

: 원하는 품사만 추출하기

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']

# Konlpy

: 한국어 자연어 처리를 할 수 있는 파이썬 패키지

from konlpy.tag import Okt

sentence = """절망의 반대가 희망은 아니다. 어두운 밤하늘에 별이 빛나듯 희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면 그 누가 세상을 비출어줄까.정희성, 희망 공부"""
t = Okt()
# Run the three analyses (morphemes, nouns, part-of-speech pairs) and print
# each result after its label.
for label, analysis in (
    ('Morphology:', t.morphs(sentence)),
    ('Nouns:', t.nouns(sentence)),
    ('Part of speech tagging results:', t.pos(sentence)),
):
    print(label, analysis)

Morphology: ['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', ... 이하생략]
Nouns: ['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', ... 이하생략]
Part of speech tagging results: [('절망', 'Noun'), ('의', 'Josa'), ('반대', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ... 이하생략]