{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.1"},"colab":{"name":"preprocess_categorical.ipynb","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"X8CRRkxzOkh5"},"source":["# カテゴリデータに対する前処理コード例\n","- ref.\n","    - preprocess methods\n","        - [機械学習のための特徴量エンジニアリング](https://www.oreilly.co.jp/books/9784873118680/)\n","        - [5.3. Preprocessing data](https://scikit-learn.org/stable/modules/preprocessing.html#normalization)\n","        - [Categorical Data, UNDERSTANDING FEATURE ENGINEERING (PART 2)](https://towardsdatascience.com/understanding-feature-engineering-part-2-categorical-data-f54324193e63)\n","        - [Category Encoders](https://contrib.scikit-learn.org/category_encoders/index.html)\n","    - data: [YouTuberデータセット公開してみた](https://qiita.com/myaun/items/7e0dd7f3f9d9d2fef497)\n","- 全体の流れ\n","    - データセットの準備\n","    - 手法1：one-hotエンコーディング\n","    - 手法2：特徴量ハッシング\n","    - 手法3：BaseNエンコーディング\n","    - 手法4：エビデンス重みエンコーディング\n"]},{"cell_type":"markdown","metadata":{"id":"H5XIUQWLOkiC"},"source":["## データセットの準備"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"aq7Xzm6aPYjc","executionInfo":{"status":"ok","timestamp":1617613170258,"user_tz":-540,"elapsed":7822,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"3a3d0cbb-0494-436f-d189-9f54074aef8e"},"source":["!pip install quilt\n","!quilt install haradai1262/YouTuber"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Requirement already satisfied: quilt in /usr/local/lib/python3.7/dist-packages (2.9.15)\n","Requirement already satisfied: pyarrow>=0.9.0 in /usr/local/lib/python3.7/dist-packages 
(from quilt) (3.0.0)\n","Requirement already satisfied: future>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (0.16.0)\n","Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.7/dist-packages (from quilt) (20.9)\n","Requirement already satisfied: requests>=2.12.4 in /usr/local/lib/python3.7/dist-packages (from quilt) (2.23.0)\n","Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (1.19.5)\n","Requirement already satisfied: xlrd>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (1.1.0)\n","Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (1.15.0)\n","Requirement already satisfied: pandas>=0.21.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (1.1.5)\n","Requirement already satisfied: pyyaml>=3.12 in /usr/local/lib/python3.7/dist-packages (from quilt) (3.13)\n","Requirement already satisfied: tqdm>=4.11.2 in /usr/local/lib/python3.7/dist-packages (from quilt) (4.41.1)\n","Requirement already satisfied: appdirs>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from quilt) (1.4.4)\n","Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=16.8->quilt) (2.4.7)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.12.4->quilt) (1.24.3)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.12.4->quilt) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.12.4->quilt) (2020.12.5)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.12.4->quilt) (3.0.4)\n","Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.21.0->quilt) (2.8.1)\n","Requirement already 
satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.21.0->quilt) (2018.9)\n","Downloading package metadata...\n","haradai1262/YouTuber already installed.\n","Overwrite? (y/n) n\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":938},"id":"PDK2k7qeOkiD","executionInfo":{"status":"ok","timestamp":1617613171684,"user_tz":-540,"elapsed":9239,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"2c9e3a73-8d09-4d7a-91db-0d75146655b7"},"source":["from quilt.data.haradai1262 import YouTuber\n","import pandas as pd\n","\n","df = YouTuber.channel_videos.UUUM_videos()\n","df.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>title</th>\n","      <th>description</th>\n","      <th>liveBroadcastContent</th>\n","      <th>tags</th>\n","      <th>publishedAt</th>\n","      <th>thumbnails</th>\n","      <th>viewCount</th>\n","      <th>likeCount</th>\n","      <th>favoriteCount</th>\n","      <th>dislikeCount</th>\n","      <th>commentCount</th>\n","      <th>caption</th>\n","      <th>definition</th>\n","      <th>dimension</th>\n","      <th>duration</th>\n","      <th>projection</th>\n","      <th>TopicIds</th>\n","      <th>relevantTopicIds</th>\n","      <th>idx</th>\n","      <th>cid</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>R7V5d94XkGQ</td>\n","      <td>【大食い】超高級寿司店で３人で食べ放題したらいくらかかるの!?【大トロ1カン2,000円】</td>\n","   
   <td>提供：ポコロンダンジョンズ\\r\\r\\r\\r\\niOS：https://bit.ly/2sGg...</td>\n","      <td>none</td>\n","      <td>['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか...</td>\n","      <td>2018-06-30T04:00:01.000Z</td>\n","      <td>https://i.ytimg.com/vi/R7V5d94XkGQ/default.jpg</td>\n","      <td>2244205.0</td>\n","      <td>27703.0</td>\n","      <td>0</td>\n","      <td>3667.0</td>\n","      <td>8647.0</td>\n","      <td>False</td>\n","      <td>hd</td>\n","      <td>2d</td>\n","      <td>PT21M16S</td>\n","      <td>rectangular</td>\n","      <td>NaN</td>\n","      <td>['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...</td>\n","      <td>1</td>\n","      <td>UCZf__ehlCEBPop___sldpBUQ</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2R9_bkcWNd4</td>\n","      <td>【女王集結】女性YouTuberたちと飲みながら本音トークしてみたら爆笑www</td>\n","      <td>しばなんチャンネルの動画\\r\\r\\r\\r\\nhttps://www.youtube.com/...</td>\n","      <td>none</td>\n","      <td>['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか...</td>\n","      <td>2018-06-29T08:00:01.000Z</td>\n","      <td>https://i.ytimg.com/vi/2R9_bkcWNd4/default.jpg</td>\n","      <td>1869268.0</td>\n","      <td>30889.0</td>\n","      <td>0</td>\n","      <td>3483.0</td>\n","      <td>8859.0</td>\n","      <td>False</td>\n","      <td>hd</td>\n","      <td>2d</td>\n","      <td>PT18M38S</td>\n","      <td>rectangular</td>\n","      <td>NaN</td>\n","      <td>['/m/04rlf', '/m/02jjt', '/m/02jjt']</td>\n","      <td>2</td>\n","      <td>UCZf__ehlCEBPop___sldpBUQ</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>EU8S-zxS9PI</td>\n","      <td>【悪質】偽物ヒカキン許さねぇ…注意してください！【なりすまし】</td>\n","      <td>◆チャンネル登録はこちら↓\\r\\r\\r\\r\\nhttp://www.youtube.com/...</td>\n","      <td>none</td>\n","      <td>['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか...</td>\n","      <td>2018-06-27T08:38:55.000Z</td>\n","      <td>https://i.ytimg.com/vi/EU8S-zxS9PI/default.jpg</td>\n","      <td>1724625.0</td>\n","      <td>33038.0</td>\n","      <td>0</td>\n","      
<td>4298.0</td>\n","      <td>11504.0</td>\n","      <td>False</td>\n","      <td>hd</td>\n","      <td>2d</td>\n","      <td>PT6M12S</td>\n","      <td>rectangular</td>\n","      <td>NaN</td>\n","      <td>['/m/04rlf', '/m/02jjt', '/m/02jjt']</td>\n","      <td>3</td>\n","      <td>UCZf__ehlCEBPop___sldpBUQ</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>5wnfkIfw0jE</td>\n","      <td>ツイッターのヒカキンシンメトリーBotが面白すぎて爆笑www</td>\n","      <td>◆チャンネル登録はこちら↓\\r\\r\\r\\r\\nhttp://www.youtube.com/...</td>\n","      <td>none</td>\n","      <td>['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか...</td>\n","      <td>2018-06-25T07:46:07.000Z</td>\n","      <td>https://i.ytimg.com/vi/5wnfkIfw0jE/default.jpg</td>\n","      <td>1109029.0</td>\n","      <td>25986.0</td>\n","      <td>0</td>\n","      <td>5063.0</td>\n","      <td>6852.0</td>\n","      <td>False</td>\n","      <td>hd</td>\n","      <td>2d</td>\n","      <td>PT6M31S</td>\n","      <td>rectangular</td>\n","      <td>NaN</td>\n","      <td>['/m/04rlf', '/m/02jjt', '/m/02jjt']</td>\n","      <td>4</td>\n","      <td>UCZf__ehlCEBPop___sldpBUQ</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>-6duBsde_XM</td>\n","      <td>【放送事故】酒飲みながら東海オンエア×ヒカキンで質問コーナーやったらヤバかったwww</td>\n","      <td>提供：モンスターストライク\\r\\r\\r\\r\\n▼キャンペーンサイトはこちら\\r\\r\\r\\r\\...</td>\n","      <td>none</td>\n","      <td>['ヒカキン', 'ヒカキンtv', 'hikakintv', 'hikakin', 'ひか...</td>\n","      <td>2018-06-21T08:00:00.000Z</td>\n","      <td>https://i.ytimg.com/vi/-6duBsde_XM/default.jpg</td>\n","      <td>1759797.0</td>\n","      <td>33923.0</td>\n","      <td>0</td>\n","      <td>2150.0</td>\n","      <td>4517.0</td>\n","      <td>False</td>\n","      <td>hd</td>\n","      <td>2d</td>\n","      <td>PT27M7S</td>\n","      <td>rectangular</td>\n","      <td>NaN</td>\n","      <td>['/m/098wr', '/m/019_rr', '/m/02wbm', '/m/019_...</td>\n","      <td>5</td>\n","      <td>UCZf__ehlCEBPop___sldpBUQ</td>\n","    </tr>\n","  
</tbody>\n","</table>\n","</div>"],"text/plain":["            id  ...                        cid\n","0  R7V5d94XkGQ  ...  UCZf__ehlCEBPop___sldpBUQ\n","1  2R9_bkcWNd4  ...  UCZf__ehlCEBPop___sldpBUQ\n","2  EU8S-zxS9PI  ...  UCZf__ehlCEBPop___sldpBUQ\n","3  5wnfkIfw0jE  ...  UCZf__ehlCEBPop___sldpBUQ\n","4  -6duBsde_XM  ...  UCZf__ehlCEBPop___sldpBUQ\n","\n","[5 rows x 21 columns]"]},"metadata":{"tags":[]},"execution_count":2}]},{"cell_type":"markdown","metadata":{"id":"HEhdkmh5OkiE"},"source":["## 手法1：one-hotエンコーディング(one-hot encoding)\n","- [pandas.get_dummies](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html)\n","- [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OEN75L4cOkiF","executionInfo":{"status":"ok","timestamp":1617613171684,"user_tz":-540,"elapsed":9232,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"fce2f6fc-f405-4761-b096-ed74d93ffe80"},"source":["df['cid'].value_counts().head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["UCsX8MJHEI5UukXoF3HLnTvg      501\n","UCMsuwHzQPFMDtHaoR7_HDxg      501\n","UC66VyLEdgCot__4w8x__n0CGA    501\n","UCtLo4nwb3ObCDZ4m8b8u7fA      501\n","UCOZ7Kq5_VWBC__TtteAcsRBg     501\n","Name: cid, dtype: int64"]},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"code","metadata":{"scrolled":true,"colab":{"base_uri":"https://localhost:8080/"},"id":"R9dR6atuOkiF","executionInfo":{"status":"ok","timestamp":1617613171685,"user_tz":-540,"elapsed":9226,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"4105f6ef-7fa4-46cc-e588-b477c99572db"},"source":["# one-hot encoding by pandas\n","\n","one_hot_df = pd.get_dummies(df['cid'])\n","\n","# check the one-hot 
vector\n","print(one_hot_df.values.shape)\n","print(df['cid'][0])\n","print(one_hot_df.values[0])\n","# argmax locates the hot column for integer and boolean dummies alike;\n","# Index.get_loc(1) is not reliable when get_dummies returns bool columns (newer pandas)\n","index = int(one_hot_df.values[0].argmax())\n","print('index = ', index)\n","print('cid = ', one_hot_df.columns[index])"],"execution_count":null,"outputs":[{"output_type":"stream","text":["(66289, 151)\n","UCZf__ehlCEBPop___sldpBUQ\n","[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0]\n","index =  88\n","cid =  UCZf__ehlCEBPop___sldpBUQ\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EL4bhNO-OkiG","executionInfo":{"status":"ok","timestamp":1617613172058,"user_tz":-540,"elapsed":9593,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"b980b672-7fd4-4980-8fc2-88697e809402"},"source":["# one-hot encoding by sklearn\n","\n","from sklearn import preprocessing\n","encoder = preprocessing.OneHotEncoder()\n","category = df['cid'].values.reshape(-1, 1)\n","encoder.fit(category)\n","one_hot_encoding = encoder.transform(category)\n","\n","# check the one-hot vector\n","print(one_hot_encoding[0:10])\n","print(type(one_hot_encoding))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["  (0, 88)\t1.0\n","  (1, 88)\t1.0\n","  (2, 88)\t1.0\n","  (3, 88)\t1.0\n","  (4, 88)\t1.0\n","  (5, 88)\t1.0\n","  (6, 88)\t1.0\n","  (7, 88)\t1.0\n","  (8, 88)\t1.0\n","  (9, 88)\t1.0\n","<class 'scipy.sparse.csr.csr_matrix'>\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"Mth4RqogOkiG"},"source":["## 手法2：特徴量ハッシング(Feature hashing)\n","- sklearn : [5.2.2 Feature hashing](https://scikit-learn.org/stable/modules/feature_extraction.html#feature-hashing)\n","- wikipedia: [Feature 
hashing](https://en.wikipedia.org/wiki/Feature_hashing)"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gBcLvwTFOkiG","executionInfo":{"status":"ok","timestamp":1617613172058,"user_tz":-540,"elapsed":9586,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"ea3cce3f-bf59-4fe4-e27b-8de4da01646e"},"source":["from sklearn.feature_extraction import FeatureHasher\n","\n","category = df['cid'].values.reshape(-1, 1)\n","\n","# n_features sets the size of the hash table (= number of output columns)\n","num_of_features = 5\n","hasher = FeatureHasher(n_features=num_of_features, input_type='string')\n","hashed_array = hasher.transform(category)\n","\n","# densify once; calling toarray() inside the loop would rebuild the full matrix every iteration\n","hashed_dense = hashed_array.toarray()\n","\n","# check the result (entries can be -1 because FeatureHasher alternates signs by default)\n","print(hashed_array.shape)\n","print(df['cid'][0])\n","print(hashed_dense[0])\n","\n","for i in range(0, len(df), 5000):\n","    print(df['cid'][i], hashed_dense[i])"],"execution_count":null,"outputs":[{"output_type":"stream","text":["(66289, 5)\n","UCZf__ehlCEBPop___sldpBUQ\n","[0. 1. 0. 0. 0.]\n","UCZf__ehlCEBPop___sldpBUQ [0. 1. 0. 0. 0.]\n","UC6wKgAlOeFNqmXV167KERhQ [0. 0. 0. 0. 1.]\n","UC4lZ8vGPy8bwmKILb__YlhzQ [0. 0. 0. 0. 1.]\n","UCKtKKtjaaPKA1Oj8Ldnfsdg [1. 0. 0. 0. 0.]\n","UCdtFmWwPlKiCOEND_95fwiA [ 0.  0.  0.  0. -1.]\n","UC2RdeFmVA1PrDqmFqJMG7hA [0. 0. 0. 1. 0.]\n","UCO06KZjWOe6b1tXrgzzakZA [0. 0. 0. 0. 1.]\n","UCg_Wchs_AGoHrlayD_rhO0Q [ 0. -1.  0.  0.  0.]\n","UC__8H678xX1SNBOM10_ReY6Q [1. 0. 0. 0. 0.]\n","UC2rbyOa3Jo7vGSibqKcRjqw [0. 0. 0. 1. 0.]\n","UCPJOCEIyI3gxXbTqKSsViqg [1. 0. 0. 0. 0.]\n","UCrOnS768WQGgNzvM0wOGa1w [0. 0. 0. 1. 0.]\n","UCjX7kJYLEAdsaCDnTsWK3Wg [0. 0. 1. 0. 0.]\n","UCdb7Jw5rprurSCutjT9BW5A [ 0. -1.  0.  0.  
0.]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"eX6NIOydOkiH"},"source":["## 手法3：BaseNエンコーディング(BaseN encoding)\n","- [BaseN](https://contrib.scikit-learn.org/category_encoders/basen.html)\n","- [BASEN ENCODING AND GRID SEARCH IN CATEGORY_ENCODERS](http://www.willmcginnis.com/2016/12/18/basen-encoding-grid-search-category_encoders/)"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vfG4U1TYQPJw","executionInfo":{"status":"ok","timestamp":1617613174689,"user_tz":-540,"elapsed":12211,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"ef49942a-63ba-426a-e857-413d31b87c09"},"source":["!pip install category_encoders"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Requirement already satisfied: category_encoders in /usr/local/lib/python3.7/dist-packages (2.2.2)\n","Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.4.1)\n","Requirement already satisfied: pandas>=0.21.1 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.1.5)\n","Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (0.22.2.post1)\n","Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.19.5)\n","Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (0.5.1)\n","Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (0.10.2)\n","Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.21.1->category_encoders) (2.8.1)\n","Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.21.1->category_encoders) (2018.9)\n","Requirement already satisfied: joblib>=0.11 in 
/usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.0->category_encoders) (1.0.1)\n","Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.5.1->category_encoders) (1.15.0)\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fKBQwUTGOkiH","executionInfo":{"status":"ok","timestamp":1617613175094,"user_tz":-540,"elapsed":12608,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"e2aa47aa-bf32-413b-b0a7-69d224938382"},"source":["import category_encoders as ce\n","\n","encoder = ce.basen.BaseNEncoder(cols='cid', base=3)\n","result = encoder.fit_transform(df)\n","\n","# collect the names of the encoded columns generated for 'cid'\n","columns = result.columns.tolist()\n","columns_name = [col for col in columns if \"cid\" in col]\n","\n","def get_cid_values(df, names, index):\n","    \"\"\"Return the values of the columns `names` at row label `index` of `df` as a list.\"\"\"\n","    return [df[name][index] for name in names]\n","\n","# print the encoding of every 5000th row\n","for i in range(0, len(df), 5000):\n","    temp = get_cid_values(result, columns_name, i)\n","    print(df['cid'][i], temp)\n"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n","  import pandas.util.testing as tm\n","/usr/local/lib/python3.7/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  
Use is_categorical_dtype instead\n","  elif pd.api.types.is_categorical(cols):\n"],"name":"stderr"},{"output_type":"stream","text":["UCZf__ehlCEBPop___sldpBUQ [0, 0, 0, 0, 0, 1]\n","UC6wKgAlOeFNqmXV167KERhQ [0, 0, 0, 1, 0, 1]\n","UC4lZ8vGPy8bwmKILb__YlhzQ [0, 0, 0, 2, 1, 0]\n","UCKtKKtjaaPKA1Oj8Ldnfsdg [0, 0, 1, 0, 1, 1]\n","UCdtFmWwPlKiCOEND_95fwiA [0, 0, 1, 1, 2, 0]\n","UC2RdeFmVA1PrDqmFqJMG7hA [0, 0, 1, 2, 2, 2]\n","UCO06KZjWOe6b1tXrgzzakZA [0, 0, 2, 1, 0, 0]\n","UCg_Wchs_AGoHrlayD_rhO0Q [0, 0, 2, 2, 0, 2]\n","UC__8H678xX1SNBOM10_ReY6Q [0, 1, 0, 0, 1, 1]\n","UC2rbyOa3Jo7vGSibqKcRjqw [0, 1, 0, 1, 2, 1]\n","UCPJOCEIyI3gxXbTqKSsViqg [0, 1, 1, 0, 0, 2]\n","UCrOnS768WQGgNzvM0wOGa1w [0, 1, 1, 1, 1, 2]\n","UCjX7kJYLEAdsaCDnTsWK3Wg [0, 1, 1, 2, 2, 0]\n","UCdb7Jw5rprurSCutjT9BW5A [0, 1, 2, 1, 1, 0]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"NekZN7juOkiI"},"source":["## 手法4：エビデンス重みエンコーディング(Weight of Evidence)\n","- category_encoders: [Weight of Evidence](https://contrib.scikit-learn.org/category_encoders/woe.html)\n","- [WEIGHT OF EVIDENCE (WOE) AND INFORMATION VALUE EXPLAINED](https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)\n","- [Weight of evidence and Information Value using Python](https://medium.com/@sundarstyles89/weight-of-evidence-and-information-value-using-python-6f05072e83eb)"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kB28yKXFOkiI","executionInfo":{"status":"ok","timestamp":1617613175370,"user_tz":-540,"elapsed":12877,"user":{"displayName":"TOMA Naruaki","photoUrl":"","userId":"11747312442870110137"}},"outputId":"46306ad6-f212-473a-d1db-c34c361ffbf0"},"source":["import category_encoders as ce\n","\n","encoder = ce.woe.WOEEncoder(cols='cid')\n","\n","# ready for evidence\n","target = df['viewCount'] > 10000\n","\n","# calculate WOE\n","result = encoder.fit_transform(df, y=target)\n","\n","# check the result\n","for i in range(0, len(df), 5000):\n","    
print(df['cid'][i], '\\t', result['cid'][i])\n"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.7/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead\n","  elif pd.api.types.is_categorical(cols):\n"],"name":"stderr"},{"output_type":"stream","text":["UCZf__ehlCEBPop___sldpBUQ \t 3.6095662610015764\n","UC6wKgAlOeFNqmXV167KERhQ \t 0.4757909580572925\n","UC4lZ8vGPy8bwmKILb__YlhzQ \t 1.9921283059003998\n","UCKtKKtjaaPKA1Oj8Ldnfsdg \t 1.807796698178421\n","UCdtFmWwPlKiCOEND_95fwiA \t -1.76268251990698\n","UC2RdeFmVA1PrDqmFqJMG7hA \t -0.8357080911509357\n","UCO06KZjWOe6b1tXrgzzakZA \t 0.42925802928263423\n","UCg_Wchs_AGoHrlayD_rhO0Q \t 3.4570742314135954\n","UC__8H678xX1SNBOM10_ReY6Q \t 2.2192798786121486\n","UC2rbyOa3Jo7vGSibqKcRjqw \t 3.022177923131213\n","UCPJOCEIyI3gxXbTqKSsViqg \t -0.2641732825478688\n","UCrOnS768WQGgNzvM0wOGa1w \t -2.1435507408557353\n","UCjX7kJYLEAdsaCDnTsWK3Wg \t -1.794934067712545\n","UCdb7Jw5rprurSCutjT9BW5A \t -1.606225698959651\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"mKr2PNPlOkiJ"},"source":[""],"execution_count":null,"outputs":[]}]}