import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
# CountVectorizer 클래스는 단어의 빈도를 Count해 Vector로 만드는 sklearn의 클래스이다.

# A one-document corpus: a single Korean sentence.
corpus = ['삶이란 흐르는 오케스트라 우리는 마에스트로.']

# Count-based vectorizer created with its default settings.
vector = CountVectorizer()

print(vector.fit_transform(corpus).toarray()) # record how many times each word occurs in corpus

[[1 1 1 1 1]]

vector.vocabulary_ # shows which column index was assigned to each word

{'삶이란': 1, '흐르는': 4, '오케스트라': 2, '우리는': 3, '마에스트로': 0}

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
# CountVectorizer 클래스는 단어의 빈도를 Count해 Vector로 만드는 sklearn의 클래스이다.

def tf_extractor(corpus):
    """Build a frequency-based document-term matrix (DTM) for ``corpus``.

    A DTM has one row per document and one column per vocabulary term;
    each cell holds how many times that term occurs in that document.

    Args:
        corpus: the documents to vectorize (e.g. a list of sentences).

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    # min_df=1 keeps every term that appears in at least one document;
    # ngram_range=(1, 1) limits the vocabulary to unigrams.
    count_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)

    # fit_transform() learns the vocabulary and encodes the corpus in one step.
    term_counts = count_vectorizer.fit_transform(corpus)
    return count_vectorizer, term_counts

# corpus : a list of sentences
#corpus1 = ['Hi there, I am Ryu Han.']
# Four short Korean sentences; each list element is passed as one document.
corpus1 = ['안녕 나는 한수연.',
           '지금 자연어 처리를 공부하고 있어.',
           '돈 걱정 없이 공부하고 싶다.',
           '한탄은 넣어두는 게 좋겠지!']

# Vectorize corpus1 and keep both the fitted vectorizer and the resulting matrix.
tf_vectorizer, tf_features = tf_extractor(corpus1)

print(tf_vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

print(tf_features)

  (0, 5)	1
  (0, 2)	1
  (0, 12)	1
  (1, 10)	1
  (1, 8)	1
  (1, 11)	1
  (1, 1)	1
  (1, 7)	1
  (2, 1)	1
  (2, 0)	1
  (2, 6)	1
  (2, 4)	1
  (3, 13)	1
  (3, 3)	1
  (3, 9)	1

features = tf_features.todense() # todense() returns a matrix
features

matrix([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
        [1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])

dtm_np = np.array(features) # convert the matrix into an ndarray; this is the vectorized text
dtm_np

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
       [1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])

numpy에는 배열(ndarray)과 행렬(matrix) 객체가 있는데 둘이 헷갈린다. 짚고 가자.
ndarray 객체는 다양한 종류의 수치 연산을 위해 고안된 범용 n차원 배열이다. (보다 효율적)
반면 matrix 객체는 선형대수 연산을 위해 특별히 고안된 객체다.
참고 글 : studymake 블로그 (https://studymake.tistory.com/408#:~:text=ndarray%EB%8A%94%20%EB%8B%A4%EC%96%91%ED%95%9C%20%EC%A2%85%EB%A5%98%EC%9D%98,%EC%B0%A8%EC%9D%B4%EC%A0%90%EC%9D%80%20%EB%AA%87%20%EA%B0%80%EC%A7%80%20%EC%95%88%EB%90%9C%EB%8B%A4.&text=ndarray%20%EB%8A%94%20'*'%EB%8A%94%20%EC%9A%94%EC%86%8C%EA%B0%84%20%EA%B3%B1%EC%85%88%EC%9D%B4%EB%8B%A4.)

dtm_np[0]

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0])

dtm_np[1]

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0])

문장에서 단어의 등장 여부, 등장 빈도로 벡터화됐다.
벡터화된 문장들은 사칙연산이 가능하다.

# Norm of the difference between the vectors of document 1 and document 0
# (np.linalg.norm defaults to the Euclidean / L2 norm for 1-D input).
print(np.linalg.norm(dtm_np[1]-dtm_np[0]))

2.8284271247461903

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out(), returns an ndarray, so convert it
# to a plain list to keep the same value as before. Fall back to the old
# method on scikit-learn releases that predate get_feature_names_out().
try:
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0
    feature_names = tf_vectorizer.get_feature_names()
feature_names

['걱정',
 '공부하고',
 '나는',
 '넣어두는',
 '싶다',
 '안녕',
 '없이',
 '있어',
 '자연어',
 '좋겠지',
 '지금',
 '처리를',
 '한수연',
 '한탄은']

def display_features(features, feature_names):
    """Print ``features`` as a table whose columns are labelled by term.

    Args:
        features: 2-D data with one row per document and one column per term.
        feature_names: column labels, in the same order as the columns.
    """
    print(pd.DataFrame(features, columns=feature_names))

display_features(features, feature_names)

   걱정  공부하고  나는  넣어두는  싶다  안녕  없이  있어  자연어  좋겠지  지금  처리를  한수연  한탄은
0   0     0   1     0   0   1   0   0    0    0   0    0    1    0
1   0     1   0     0   0   0   0   1    1    0   1    1    0    0
2   1     1   0     0   1   0   1   0    0    0   0    0    0    0
3   0     0   0     1   0   0   0   0    0    1   0    0    0    1

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus):
    """Vectorize ``corpus`` with a ``TfidfVectorizer``.

    Args:
        corpus: the documents to vectorize (e.g. a list of sentences).

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    # min_df=1 keeps every term that appears in at least one document;
    # ngram_range=(1, 1) limits the vocabulary to unigrams.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=1)

    # Learn the vocabulary and encode the corpus in one step.
    weights = tfidf.fit_transform(corpus)
    return tfidf, weights

# The same four Korean sentences as before; each list element is one document.
corpus = ['안녕 나는 한수연.',
           '지금 자연어 처리를 공부하고 있어.',
           '돈 걱정 없이 공부하고 싶다.',
           '한탄은 넣어두는 게 좋겠지!']

# Vectorize the corpus and keep both the fitted vectorizer and the resulting matrix.
tfidf_vectorizer, tfidf_features = tfidf_extractor(corpus)

print(tfidf_vectorizer)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

print(tfidf_features)

  (0, 12)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (0, 5)	0.5773502691896257
  (1, 7)	0.4651619335222394
  (1, 1)	0.3667390112974172
  (1, 11)	0.4651619335222394
  (1, 8)	0.4651619335222394
  (1, 10)	0.4651619335222394
  (2, 4)	0.5254727492640658
  (2, 6)	0.5254727492640658
  (2, 0)	0.5254727492640658
  (2, 1)	0.41428875116588965
  (3, 9)	0.5773502691896257
  (3, 3)	0.5773502691896257
  (3, 13)	0.5773502691896257

features = tfidf_features.todense() # todense() returns a matrix
features

matrix([[0.        , 0.        , 0.57735027, 0.        , 0.        ,
         0.57735027, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.57735027, 0.        ],
        [0.        , 0.36673901, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.46516193, 0.46516193, 0.        ,
         0.46516193, 0.46516193, 0.        , 0.        ],
        [0.52547275, 0.41428875, 0.        , 0.        , 0.52547275,
         0.        , 0.52547275, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.57735027, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.57735027,
         0.        , 0.        , 0.        , 0.57735027]])

dtm_np = np.array(features) # convert the matrix into an ndarray; this is the vectorized text
dtm_np

array([[0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.        ],
       [0.        , 0.36673901, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46516193, 0.46516193, 0.        ,
        0.46516193, 0.46516193, 0.        , 0.        ],
       [0.52547275, 0.41428875, 0.        , 0.        , 0.52547275,
        0.        , 0.52547275, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.57735027]])

numpy에는 배열(ndarray)과 행렬(matrix) 객체가 있는데 둘이 헷갈린다. 짚고 가자.
ndarray 객체는 다양한 종류의 수치 연산을 위해 고안된 범용 n차원 배열이다. (보다 효율적)
반면 matrix 객체는 선형대수 연산을 위해 특별히 고안된 객체다.
참고 글 : studymake 블로그 (https://studymake.tistory.com/408#:~:text=ndarray%EB%8A%94%20%EB%8B%A4%EC%96%91%ED%95%9C%20%EC%A2%85%EB%A5%98%EC%9D%98,%EC%B0%A8%EC%9D%B4%EC%A0%90%EC%9D%80%20%EB%AA%87%20%EA%B0%80%EC%A7%80%20%EC%95%88%EB%90%9C%EB%8B%A4.&text=ndarray%20%EB%8A%94%20'*'%EB%8A%94%20%EC%9A%94%EC%86%8C%EA%B0%84%20%EA%B3%B1%EC%85%88%EC%9D%B4%EB%8B%A4.)

dtm_np[0]

array([0.        , 0.        , 0.57735027, 0.        , 0.        ,
       0.57735027, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.57735027, 0.        ])

dtm_np[1]

array([0.        , 0.36673901, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.46516193, 0.46516193, 0.        ,
       0.46516193, 0.46516193, 0.        , 0.        ])

문장이 단어의 등장 빈도(TF)에 IDF 가중치를 반영한 TF-IDF 값으로 벡터화됐다.
벡터화된 문장들은 사칙연산이 가능하다.

# Norm of the difference between the vectors of document 1 and document 0
# (np.linalg.norm defaults to the Euclidean / L2 norm for 1-D input).
print(np.linalg.norm(dtm_np[1]-dtm_np[0]))

1.4142135623730951

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement, get_feature_names_out(), returns an ndarray, so convert it
# to a plain list to keep the same value as before. Fall back to the old
# method on scikit-learn releases that predate get_feature_names_out().
try:
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0
    feature_names = tfidf_vectorizer.get_feature_names()
feature_names

['걱정',
 '공부하고',
 '나는',
 '넣어두는',
 '싶다',
 '안녕',
 '없이',
 '있어',
 '자연어',
 '좋겠지',
 '지금',
 '처리를',
 '한수연',
 '한탄은']

def display_features(features, feature_names):
    """Print ``features`` as a labelled table, followed by the table's type.

    Args:
        features: 2-D data with one row per document and one column per term.
        feature_names: column labels, in the same order as the columns.
    """
    table = pd.DataFrame(features, columns=feature_names)
    # Print the table first, then its type, each on its own print() call.
    for item in (table, type(table)):
        print(item)

display_features(features, feature_names)

         걱정      공부하고       나는     넣어두는        싶다       안녕        없이  \
0  0.000000  0.000000  0.57735  0.00000  0.000000  0.57735  0.000000   
1  0.000000  0.366739  0.00000  0.00000  0.000000  0.00000  0.000000   
2  0.525473  0.414289  0.00000  0.00000  0.525473  0.00000  0.525473   
3  0.000000  0.000000  0.00000  0.57735  0.000000  0.00000  0.000000   

         있어       자연어      좋겠지        지금       처리를      한수연      한탄은  
0  0.000000  0.000000  0.00000  0.000000  0.000000  0.57735  0.00000  
1  0.465162  0.465162  0.00000  0.465162  0.465162  0.00000  0.00000  
2  0.000000  0.000000  0.00000  0.000000  0.000000  0.00000  0.00000  
3  0.000000  0.000000  0.57735  0.000000  0.000000  0.00000  0.57735  
<class 'pandas.core.frame.DataFrame'>

Ryu Han

[NLP] BoW, 문서에 어떤 단어가 몇 번 등장했는가로 벡터화하기 (feat. TF-IDF)

Posted by Ryu Han

1 Comments

Post a Comment

Contact Form