!python --version
Python 3.10.12

16. カテゴリデータに対する前処理コード例

16.1. データセットの準備

# The quilt-based download of the YouTuber dataset is kept below, commented out.
# The cell instead reads the table from a local Excel workbook
# (presumably an export of the same data -- confirm against the quilt package).
#!pip install quilt
#!quilt install haradai1262/YouTuber
#from quilt.data.haradai1262 import YouTuber
import pandas as pd

#df = YouTuber.channel_videos.UUUM_videos()
# Load the workbook "youtuber.xlsx" from the working directory into a DataFrame.
df = pd.read_excel("youtuber.xlsx")
# Display the first rows as the cell output.
df.head()
id title description liveBroadcastContent tags publishedAt thumbnails viewCount likeCount favoriteCount ... commentCount caption definition dimension duration projection TopicIds relevantTopicIds idx cid
0 R7V5d94XkGQ 【大食い】超高級寿司店で3人で食べ放題したらいくらかかるの!?【大トロ1カン2,000円】 提供:ポコロンダンジョンズ\n\n\n\niOS:https://bit.ly/2sGgOR... none ['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか... 2018-06-30T04:00:01.000Z https://i.ytimg.com/vi/R7V5d94XkGQ/default.jpg 2244205.0 27703.0 0 ... 8647.0 False hd 2d PT21M16S rectangular NaN ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w... 1 UCZf__ehlCEBPop___sldpBUQ
1 2R9_bkcWNd4 【女王集結】女性YouTuberたちと飲みながら本音トークしてみたら爆笑www しばなんチャンネルの動画\n\n\n\nhttps://www.youtube.com/wa... none ['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか... 2018-06-29T08:00:01.000Z https://i.ytimg.com/vi/2R9_bkcWNd4/default.jpg 1869268.0 30889.0 0 ... 8859.0 False hd 2d PT18M38S rectangular NaN ['/m/04rlf', '/m/02jjt', '/m/02jjt'] 2 UCZf__ehlCEBPop___sldpBUQ
2 EU8S-zxS9PI 【悪質】偽物ヒカキン許さねぇ…注意してください!【なりすまし】 ◆チャンネル登録はこちら↓\n\n\n\nhttp://www.youtube.com/us... none ['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか... 2018-06-27T08:38:55.000Z https://i.ytimg.com/vi/EU8S-zxS9PI/default.jpg 1724625.0 33038.0 0 ... 11504.0 False hd 2d PT6M12S rectangular NaN ['/m/04rlf', '/m/02jjt', '/m/02jjt'] 3 UCZf__ehlCEBPop___sldpBUQ
3 5wnfkIfw0jE ツイッターのヒカキンシンメトリーBotが面白すぎて爆笑www ◆チャンネル登録はこちら↓\n\n\n\nhttp://www.youtube.com/us... none ['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか... 2018-06-25T07:46:07.000Z https://i.ytimg.com/vi/5wnfkIfw0jE/default.jpg 1109029.0 25986.0 0 ... 6852.0 False hd 2d PT6M31S rectangular NaN ['/m/04rlf', '/m/02jjt', '/m/02jjt'] 4 UCZf__ehlCEBPop___sldpBUQ
4 -6duBsde_XM 【放送事故】酒飲みながら東海オンエア×ヒカキンで質問コーナーやったらヤバかったwww 提供:モンスターストライク\n\n\n\n▼キャンペーンサイトはこちら\n\n\n\nhtt... none ['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか... 2018-06-21T08:00:00.000Z https://i.ytimg.com/vi/-6duBsde_XM/default.jpg 1759797.0 33923.0 0 ... 4517.0 False hd 2d PT27M7S rectangular NaN ['/m/098wr', '/m/019_rr', '/m/02wbm', '/m/019_... 5 UCZf__ehlCEBPop___sldpBUQ

5 rows × 21 columns

16.2. 手法1:one-hotエンコーディング(one-hot encoding)

# Count how many rows each channel id (`cid`) has and show the first five counts.
df['cid'].value_counts().head()
cid
UCZf__ehlCEBPop___sldpBUQ    501
UC__AsSnEuyVgO9TWvZE_ziA     501
UCmol_xpWkIbQU0ZCuinqpQA     501
UCXqocGp__RQ_sTw8EpPDg10A    501
UCiJvvLq45i4sC4__dTiV88SQ    501
Name: count, dtype: int64
# One-hot encode `cid` with pandas: one integer 0/1 column per distinct value.
one_hot_df = pd.get_dummies(df['cid'], dtype=int)

# Inspect the encoding of the first row.
first_row = one_hot_df.values[0]
print(one_hot_df.values.shape)
print(df['cid'][0])
print(first_row)
# Position of the 1 inside the row; the column label at that position
# maps the vector back to the original cid.
index = pd.Index(first_row).get_loc(1)
print('index = ', index)
print('cid = ', one_hot_df.columns[index])
(66289, 151)
UCZf__ehlCEBPop___sldpBUQ
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
index =  88
cid =  UCZf__ehlCEBPop___sldpBUQ
# One-hot encode `cid` with scikit-learn.

from sklearn import preprocessing

# The encoder takes 2-D input, so the column is shaped as (n_samples, 1).
category = df['cid'].values.reshape(-1, 1)
encoder = preprocessing.OneHotEncoder()
one_hot_encoding = encoder.fit(category).transform(category)

# Inspect the result: the first ten encoded rows and the container type.
print(one_hot_encoding[:10])
print(type(one_hot_encoding))
  (0, 88)	1.0
  (1, 88)	1.0
  (2, 88)	1.0
  (3, 88)	1.0
  (4, 88)	1.0
  (5, 88)	1.0
  (6, 88)	1.0
  (7, 88)	1.0
  (8, 88)	1.0
  (9, 88)	1.0
<class 'scipy.sparse._csr.csr_matrix'>

16.3. 手法2:特徴量ハッシング(Feature hashing)

from sklearn.feature_extraction import FeatureHasher

# With input_type='string' every sample must be an iterable of strings, so the
# cid column is reshaped to (n_samples, 1): one single-token sample per row.
category = df['cid'].values.reshape(-1, 1)

# Size of the hash table (= n_features of FeatureHasher); change it as needed.
num_of_features = 5
hasher = FeatureHasher(n_features=num_of_features, input_type='string')
hashed_array = hasher.transform(category)

# Densify the sparse result once up front instead of once per printed row.
hashed_dense = hashed_array.toarray()

# Inspect the result: overall shape, then the first row.
print(hashed_array.shape)
print(df['cid'][0])
print(hashed_dense[0])

# Print every 5000th cid next to its hashed vector.
for i in range(0, len(df), 5000):
    print(df['cid'][i], hashed_dense[i])
(66289, 5)
UCZf__ehlCEBPop___sldpBUQ
[0. 1. 0. 0. 0.]
UCZf__ehlCEBPop___sldpBUQ [0. 1. 0. 0. 0.]
UC6wKgAlOeFNqmXV167KERhQ [0. 0. 0. 0. 1.]
UC4lZ8vGPy8bwmKILb__YlhzQ [0. 0. 0. 0. 1.]
UCKtKKtjaaPKA1Oj8Ldnfsdg [1. 0. 0. 0. 0.]
UCdtFmWwPlKiCOEND_95fwiA [ 0.  0.  0.  0. -1.]
UC2RdeFmVA1PrDqmFqJMG7hA [0. 0. 0. 1. 0.]
UCO06KZjWOe6b1tXrgzzakZA [0. 0. 0. 0. 1.]
UCg_Wchs_AGoHrlayD_rhO0Q [ 0. -1.  0.  0.  0.]
UC__8H678xX1SNBOM10_ReY6Q [1. 0. 0. 0. 0.]
UC2rbyOa3Jo7vGSibqKcRjqw [0. 0. 0. 1. 0.]
UCPJOCEIyI3gxXbTqKSsViqg [1. 0. 0. 0. 0.]
UCrOnS768WQGgNzvM0wOGa1w [0. 0. 0. 1. 0.]
UCjX7kJYLEAdsaCDnTsWK3Wg [0. 0. 1. 0. 0.]
UCdb7Jw5rprurSCutjT9BW5A [ 0. -1.  0.  0.  0.]

16.4. 手法3:BaseNエンコーディング(BaseN encoding)

!pip install category_encoders
Requirement already satisfied: category_encoders in /usr/local/lib/python3.10/dist-packages (2.6.3)
Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (1.25.2)
Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (1.2.2)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (1.11.4)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (0.14.2)
Requirement already satisfied: pandas>=1.0.5 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (2.0.3)
Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.10/dist-packages (from category_encoders) (0.5.6)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.5->category_encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.5->category_encoders) (2023.4)
Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.5->category_encoders) (2024.1)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.1->category_encoders) (1.16.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->category_encoders) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->category_encoders) (3.5.0)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from statsmodels>=0.9.0->category_encoders) (24.0)
import category_encoders as ce

# Encode `cid` as base-3 digits with category_encoders.
encoder = ce.basen.BaseNEncoder(base=3, cols='cid')
result = encoder.fit_transform(df)

# Collect the output columns whose label contains "cid"
# (these hold the encoded digits).
columns = list(result.columns)
columns_name = [label for label in columns if "cid" in label]

def get_cid_values(df, names, index):
    """Return the values found at row key ``index`` in each column of ``names``.

    Args:
        df: DataFrame to read from.
        names: Column labels to look up; the result keeps this order.
        index: Row key used as ``df[name][index]`` for every column.

    Returns:
        A list with one value per entry in ``names`` (empty if ``names`` is empty).
    """
    return [df[name][index] for name in names]

# Print every 5000th cid next to the values of its encoded columns.
for row in range(0, len(df), 5000):
    digits = get_cid_values(result, columns_name, row)
    print(df['cid'][row], digits)
UCZf__ehlCEBPop___sldpBUQ [0, 0, 0, 0, 1]
UC6wKgAlOeFNqmXV167KERhQ [0, 0, 1, 0, 1]
UC4lZ8vGPy8bwmKILb__YlhzQ [0, 0, 2, 1, 0]
UCKtKKtjaaPKA1Oj8Ldnfsdg [0, 1, 0, 1, 1]
UCdtFmWwPlKiCOEND_95fwiA [0, 1, 1, 2, 0]
UC2RdeFmVA1PrDqmFqJMG7hA [0, 1, 2, 2, 2]
UCO06KZjWOe6b1tXrgzzakZA [0, 2, 1, 0, 0]
UCg_Wchs_AGoHrlayD_rhO0Q [0, 2, 2, 0, 2]
UC__8H678xX1SNBOM10_ReY6Q [1, 0, 0, 1, 1]
UC2rbyOa3Jo7vGSibqKcRjqw [1, 0, 1, 2, 1]
UCPJOCEIyI3gxXbTqKSsViqg [1, 1, 0, 0, 2]
UCrOnS768WQGgNzvM0wOGa1w [1, 1, 1, 1, 2]
UCjX7kJYLEAdsaCDnTsWK3Wg [1, 1, 2, 2, 0]
UCdb7Jw5rprurSCutjT9BW5A [1, 2, 1, 1, 0]

16.5. 手法4:エビデンス重みエンコーディング(Weight of Evidence)

import category_encoders as ce

# Binary target that serves as the evidence: True where viewCount exceeds 10000.
target = df['viewCount'] > 10000

# Fit the Weight-of-Evidence encoder for `cid` against that target and
# encode the frame in the same call.
encoder = ce.woe.WOEEncoder(cols='cid')
result = encoder.fit_transform(df, y=target)

# Inspect the result: every 5000th cid with its encoded value.
for row in range(0, len(df), 5000):
    print(df['cid'][row], '\t', result['cid'][row])
UCZf__ehlCEBPop___sldpBUQ 	 3.6095662610015764
UC6wKgAlOeFNqmXV167KERhQ 	 0.4757909580572925
UC4lZ8vGPy8bwmKILb__YlhzQ 	 1.9921283059003998
UCKtKKtjaaPKA1Oj8Ldnfsdg 	 1.807796698178421
UCdtFmWwPlKiCOEND_95fwiA 	 -1.76268251990698
UC2RdeFmVA1PrDqmFqJMG7hA 	 -0.8357080911509357
UCO06KZjWOe6b1tXrgzzakZA 	 0.42925802928263423
UCg_Wchs_AGoHrlayD_rhO0Q 	 3.4570742314135954
UC__8H678xX1SNBOM10_ReY6Q 	 2.2192798786121486
UC2rbyOa3Jo7vGSibqKcRjqw 	 3.022177923131213
UCPJOCEIyI3gxXbTqKSsViqg 	 -0.2641732825478688
UCrOnS768WQGgNzvM0wOGa1w 	 -2.1435507408557353
UCjX7kJYLEAdsaCDnTsWK3Wg 	 -1.794934067712545
UCdb7Jw5rprurSCutjT9BW5A 	 -1.606225698959651