{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 授業評価アンケートのデータ収集用スクリプト\n",
    "[調査と解析班](https://r.st.ie.u-ryukyu.ac.jp/assessment/)で収集している、知能情報コース専門科目の2021年度前期科目を対象に自由記述欄のデータを収集。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  2455  100  2455    0     0  11135      0 --:--:-- --:--:-- --:--:-- 11418\n"
     ]
    }
   ],
   "source": [
    "# Download the lecture list. --fail makes curl exit with an error on an HTTP\n",
    "# error status instead of writing the server's error page into the CSV file.\n",
    "!curl --fail -O https://ie.u-ryukyu.ac.jp/~tnal/2022/dm/static/r_assesment_list.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>grade</th>\n",
       "      <th>required</th>\n",
       "      <th>year</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2021</td>\n",
       "      <td>https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>技術者の倫理</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2021</td>\n",
       "      <td>https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>工学基礎演習</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2021</td>\n",
       "      <td>https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>プログラミングⅠ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2021</td>\n",
       "      <td>https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>基礎数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>2021</td>\n",
       "      <td>https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      title  grade  required  year  \\\n",
       "0     工業数学Ⅰ      1      True  2021   \n",
       "1    技術者の倫理      1      True  2021   \n",
       "2    工学基礎演習      1      True  2021   \n",
       "3  プログラミングⅠ      1      True  2021   \n",
       "4     基礎数学Ⅰ      1     False  2021   \n",
       "\n",
       "                                                 url  \n",
       "0  https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...  \n",
       "1  https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...  \n",
       "2  https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...  \n",
       "3  https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...  \n",
       "4  https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the lecture list; column names are supplied here and no row of the\n",
    "# file is treated as a header.\n",
    "source_file = 'r_assesment_list.csv'\n",
    "assesment_columns = ['title', 'grade', 'required', 'year', 'url']\n",
    "df = pd.read_csv(source_file, header=None, names=assesment_columns)\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'grade': 3, 'required': False, 'year': 2021, 'url': 'https://r.st.ie.u-ryukyu.ac.jp/assessment/2021a/result/makepage.php?kamoku=i334'}\n"
     ]
    }
   ],
   "source": [
    "# Index the lecture list by title so each lecture's metadata can be looked up by name.\n",
    "lectures = {}\n",
    "for title, grade, required, year, url in df.itertuples(index=False, name=None):\n",
    "    lectures[title] = dict(grade=grade, required=required, year=year, url=url)\n",
    "\n",
    "# Spot-check one known lecture.\n",
    "print(lectures['データマイニング'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# urlから '&ex=1'を削除した状態でのセレクタ\n",
    "# Q21-(1), (2), (3), (4), Q22\n",
    "selectors = {'Q21 (1)':'body > div:nth-child(44) > ul:nth-child(1)',\n",
    "    'Q21 (2)': 'body > div:nth-child(47) > ul:nth-child(1)',\n",
    "    'Q21 (3)': 'body > div:nth-child(50) > ul:nth-child(1)',\n",
    "    'Q21 (4)': 'body > div:nth-child(53) > ul:nth-child(1)',\n",
    "    'Q22': 'body > div:nth-child(56) > ul:nth-child(1)'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>grade</th>\n",
       "      <th>required</th>\n",
       "      <th>q_id</th>\n",
       "      <th>comment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>Q21 (1)</td>\n",
       "      <td>特になし</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>Q21 (2)</td>\n",
       "      <td>正直わかりずらい。むだに間があるし。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>Q21 (2)</td>\n",
       "      <td>例題を取り入れて理解しやすくしてほしい。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>Q21 (2)</td>\n",
       "      <td>特になし</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>工業数学Ⅰ</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>Q21 (2)</td>\n",
       "      <td>スライドに書く文字をもう少しわかりやすくして欲しいです。</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   title  grade  required     q_id                       comment\n",
       "0  工業数学Ⅰ      1      True  Q21 (1)                          特になし\n",
       "1  工業数学Ⅰ      1      True  Q21 (2)            正直わかりずらい。むだに間があるし。\n",
       "2  工業数学Ⅰ      1      True  Q21 (2)          例題を取り入れて理解しやすくしてほしい。\n",
       "3  工業数学Ⅰ      1      True  Q21 (2)                          特になし\n",
       "4  工業数学Ⅰ      1      True  Q21 (2)  スライドに書く文字をもう少しわかりやすくして欲しいです。"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_comments(lectures, selectors, timeout=30):\n",
    "    \"\"\"Collect the free-text lecture comments from each lecture's result page.\n",
    "\n",
    "    Args:\n",
    "        lectures: dict mapping a lecture title to a dict that has at least a\n",
    "            'url' entry (the page to fetch).\n",
    "        selectors: dict mapping a question id to the CSS selector of the ul\n",
    "            element that lists the answers to that question.\n",
    "        timeout: seconds to wait on each HTTP request (passed to requests.get).\n",
    "\n",
    "    Returns dict:\n",
    "      {lecture title: {'q_id': ['comment 1', 'comment 2']},\n",
    "       lecture title: {'q_id': ['comment 1', 'comment 2']}, ...}\n",
    "      A q_id is present only when at least one li item was found for it.\n",
    "      A lecture whose page returned an HTTP error status maps to an empty\n",
    "      dict, and a warning is issued for it.\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.RequestException: on connection failure or timeout.\n",
    "    \"\"\"\n",
    "    result = {}\n",
    "    for lec_name, items in lectures.items():\n",
    "        url = items['url']\n",
    "        # Without a timeout a stalled server would block the notebook indefinitely.\n",
    "        r = requests.get(url, timeout=timeout)\n",
    "        if not r.ok:\n",
    "            # Do not parse an error page as if it were survey results.\n",
    "            warnings.warn(f'{lec_name}: HTTP {r.status_code} for {url}')\n",
    "            result[lec_name] = {}\n",
    "            continue\n",
    "        # Decode with the encoding detected from the body rather than the header.\n",
    "        r.encoding = r.apparent_encoding\n",
    "        soup = BeautifulSoup(r.text, 'html.parser')\n",
    "\n",
    "        comments = {}\n",
    "        for q_id, selector in selectors.items():\n",
    "            elements = soup.select(selector)\n",
    "            if not elements:\n",
    "                continue\n",
    "            # Only the first element matched by the selector is used.\n",
    "            texts = [li.text.rstrip() for li in elements[0].find_all('li')]\n",
    "            if texts:\n",
    "                comments[q_id] = texts\n",
    "        result[lec_name] = comments\n",
    "    return result\n",
    "\n",
    "def comment_to_dataframe(lectures, all_comments):\n",
    "    \"\"\"扱いやすいように変換\n",
    "    Returns pd.DataFrame:\n",
    "      columns = ['title', 'grade', 'required', 'q_id', 'comment']\n",
    "    \"\"\"\n",
    "    tables = []\n",
    "    for title, items in lectures.items():\n",
    "        grade = items['grade']\n",
    "        required = items['required']\n",
    "        for q_id, comments in all_comments[title].items():\n",
    "            for comment in comments:\n",
    "                if len(comment) != 0:\n",
    "                    tables.append([title, grade, required, q_id, comment])\n",
    "\n",
    "    columns_name = ['title', 'grade', 'required', 'q_id', 'comment']\n",
    "    df = pd.DataFrame(tables, columns=columns_name)\n",
    "    return df\n",
    "\n",
    "\n",
    "# Fetch the comments for every lecture, then flatten them into one table.\n",
    "# NOTE: this rebinds df, which held the lecture list until now.\n",
    "all_comments = get_comments(lectures=lectures, selectors=selectors)\n",
    "df = comment_to_dataframe(lectures=lectures, all_comments=all_comments)\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory first; writing the pickle fails if it is missing.\n",
    "output_path = Path('./corpus/r_assesment.pkl')\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "df.to_pickle(output_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "880b2a8c90f9e6beae80b56829e3f671fedd58b6d14887184ddce26124cedfbd"
  },
  "kernelspec": {
   "display_name": "Python 3.8.9 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
