{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 授業評価アンケートのデータ収集用スクリプト\n", "[調査と解析班](https://r.st.ie.u-ryukyu.ac.jp/assessment/)で収集している、知能情報コース専門科目の2021年度前期科目を対象に自由記述欄のデータを収集。" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 2455 100 2455 0 0 11135 0 --:--:-- --:--:-- --:--:-- 11418\n" ] } ], "source": [ "!curl -O https://ie.u-ryukyu.ac.jp/~tnal/2022/dm/static/r_assesment_list.csv" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegraderequiredyearurl
0工業数学Ⅰ1True2021https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
1技術者の倫理1True2021https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
2工学基礎演習1True2021https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
3プログラミングⅠ1True2021https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
4基礎数学Ⅰ1False2021https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
\n", "
" ], "text/plain": [ " title grade required year \\\n", "0 工業数学Ⅰ 1 True 2021 \n", "1 技術者の倫理 1 True 2021 \n", "2 工学基礎演習 1 True 2021 \n", "3 プログラミングⅠ 1 True 2021 \n", "4 基礎数学Ⅰ 1 False 2021 \n", "\n", " url \n", "0 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... \n", "1 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... \n", "2 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... \n", "3 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... \n", "4 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Lecture list downloaded by the curl cell. Column names are passed explicitly,\n",
 "# so read_csv treats the first row of the file as data.\n",
 "source_file = 'r_assesment_list.csv'\n",
 "assesment_columns = ['title', 'grade', 'required', 'year', 'url']\n",
 "# Dedicated name: `df` is bound to the collected-comments table in a later cell,\n",
 "# so reusing it here would make re-running these cells depend on execution order.\n",
 "lectures_df = pd.read_csv(source_file, names=assesment_columns)\n",
 "lectures_df.head()"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'grade': 3, 'required': False, 'year': 2021, 'url': 'https://r.st.ie.u-ryukyu.ac.jp/assessment/2021a/result/makepage.php?kamoku=i334'}\n" ] } ], "source": [
 "# Map each lecture title to its metadata:\n",
 "#   {title: {'grade': ..., 'required': ..., 'year': ..., 'url': ...}}\n",
 "# If a title appears in more than one row, the last row wins.\n",
 "lectures = {\n",
 "    row.title: {'grade': row.grade, 'required': row.required, 'year': row.year, 'url': row.url}\n",
 "    for row in lectures_df.itertuples(index=False)\n",
 "}\n",
 "\n",
 "# Spot-check a single lecture entry.\n",
 "print(lectures['データマイニング'])"
 ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "# CSS selectors for the result page as served with '&ex=1' removed from the URL.\n",
 "# One entry per free-text question: Q21-(1), (2), (3), (4) and Q22.\n",
 "selectors = {\n",
 "    'Q21 (1)': 'body > div:nth-child(44) > ul:nth-child(1)',\n",
 "    'Q21 (2)': 'body > div:nth-child(47) > ul:nth-child(1)',\n",
 "    'Q21 (3)': 'body > div:nth-child(50) > ul:nth-child(1)',\n",
 "    'Q21 (4)': 'body > div:nth-child(53) > ul:nth-child(1)',\n",
 "    'Q22': 'body > div:nth-child(56) > ul:nth-child(1)',\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegraderequiredq_idcomment
0工業数学Ⅰ1TrueQ21 (1)特になし
1工業数学Ⅰ1TrueQ21 (2)正直わかりずらい。むだに間があるし。
2工業数学Ⅰ1TrueQ21 (2)例題を取り入れて理解しやすくしてほしい。
3工業数学Ⅰ1TrueQ21 (2)特になし
4工業数学Ⅰ1TrueQ21 (2)スライドに書く文字をもう少しわかりやすくして欲しいです。
\n", "
" ], "text/plain": [ " title grade required q_id comment\n", "0 工業数学Ⅰ 1 True Q21 (1) 特になし\n", "1 工業数学Ⅰ 1 True Q21 (2) 正直わかりずらい。むだに間があるし。\n", "2 工業数学Ⅰ 1 True Q21 (2) 例題を取り入れて理解しやすくしてほしい。\n", "3 工業数学Ⅰ 1 True Q21 (2) 特になし\n", "4 工業数学Ⅰ 1 True Q21 (2) スライドに書く文字をもう少しわかりやすくして欲しいです。" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def get_comments(lectures, selectors, timeout=30):\n",
 "    \"\"\"Collect the free-text comments of every lecture.\n",
 "\n",
 "    Args:\n",
 "        lectures: dict mapping lecture title -> dict with at least a 'url' key\n",
 "            (the page that is fetched for that lecture).\n",
 "        selectors: dict mapping question id -> CSS selector of the element\n",
 "            whose <li> descendants hold the comments for that question.\n",
 "        timeout: seconds passed to requests.get, so that a single\n",
 "            unresponsive page cannot hang the whole run.\n",
 "\n",
 "    Returns:\n",
 "        dict: {lecture title: {q_id: ['comment 1', 'comment 2', ...]}}.\n",
 "        A q_id key exists only when at least one <li> was found for it.\n",
 "        A lecture whose page returned an HTTP error maps to an empty dict.\n",
 "    \"\"\"\n",
 "    result = {}\n",
 "    for lec_name, items in lectures.items():\n",
 "        r = requests.get(items['url'], timeout=timeout)\n",
 "        if not r.ok:\n",
 "            # Report the failure instead of parsing an error page and silently\n",
 "            # recording the lecture as having no comments.\n",
 "            print('WARNING: HTTP {} for {}: {}'.format(r.status_code, lec_name, items['url']))\n",
 "            result[lec_name] = {}\n",
 "            continue\n",
 "        # Decode r.text with the encoding detected from the response body.\n",
 "        r.encoding = r.apparent_encoding\n",
 "        soup = BeautifulSoup(r.text, 'html.parser')\n",
 "\n",
 "        comments = {}\n",
 "        for q_id, selector in selectors.items():\n",
 "            elements = soup.select(selector)\n",
 "            if not elements:\n",
 "                continue\n",
 "            # Only the first matching element is used.\n",
 "            texts = [li.text.rstrip() for li in elements[0].find_all('li')]\n",
 "            if texts:\n",
 "                comments[q_id] = texts\n",
 "        result[lec_name] = comments\n",
 "    return result\n",
 "\n",
 "\n",
 "def comment_to_dataframe(lectures, all_comments):\n",
 "    \"\"\"Flatten the collected comments into one row per comment.\n",
 "\n",
 "    Args:\n",
 "        lectures: dict mapping lecture title -> dict with 'grade' and 'required'.\n",
 "        all_comments: dict as returned by get_comments; it must contain an\n",
 "            entry for every title in `lectures` (KeyError otherwise).\n",
 "\n",
 "    Returns:\n",
 "        pd.DataFrame with columns ['title', 'grade', 'required', 'q_id', 'comment'].\n",
 "        Empty comment strings are skipped.\n",
 "    \"\"\"\n",
 "    columns_name = ['title', 'grade', 'required', 'q_id', 'comment']\n",
 "    rows = [\n",
 "        [title, items['grade'], items['required'], q_id, comment]\n",
 "        for title, items in lectures.items()\n",
 "        for q_id, comments in all_comments[title].items()\n",
 "        for comment in comments\n",
 "        if comment\n",
 "    ]\n",
 "    return pd.DataFrame(rows, columns=columns_name)\n",
 "\n",
 "\n",
 "all_comments = get_comments(lectures, selectors)\n",
 "df = comment_to_dataframe(lectures, all_comments)\n",
 "df.head()"
 ] }, {
"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "\n",
 "output_path = './corpus/r_assesment.pkl'\n",
 "# Create the output directory first: to_pickle does not create missing parent\n",
 "# directories, so this cell would otherwise fail on a fresh checkout.\n",
 "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
 "df.to_pickle(output_path)\n"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "880b2a8c90f9e6beae80b56829e3f671fedd58b6d14887184ddce26124cedfbd" }, "kernelspec": { "display_name": "Python 3.8.9 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }