授業評価アンケートのデータ収集用スクリプト
授業評価アンケートのデータ収集用スクリプト
調査と解析班で収集している、知能情報コース専門科目の2021年度前期科目を対象に、自由記述欄のデータを収集する。
!curl -O https://ie.u-ryukyu.ac.jp/~tnal/2022/dm/static/r_assesment_list.csv
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 2455 100 2455 0 0 11135 0 --:--:-- --:--:-- --:--:-- 11418
import requests
import pandas as pd
from bs4 import BeautifulSoup
# The course list has no header row (the first CSV line is already a course),
# so the column names are supplied explicitly.
assesment_columns = ['title', 'grade', 'required', 'year', 'url']
source_file = 'r_assesment_list.csv'
df = pd.read_csv(filepath_or_buffer=source_file, names=assesment_columns)
# Notebook cell output: preview the first rows of the course list.
df.head()
title | grade | required | year | url | |
---|---|---|---|---|---|
0 | 工業数学Ⅰ | 1 | True | 2021 | https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... |
1 | 技術者の倫理 | 1 | True | 2021 | https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... |
2 | 工学基礎演習 | 1 | True | 2021 | https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... |
3 | プログラミングⅠ | 1 | True | 2021 | https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... |
4 | 基礎数学Ⅰ | 1 | False | 2021 | https://r.st.ie.u-ryukyu.ac.jp/assessment/2021... |
# Index the course list by title:
#   title -> {'grade': ..., 'required': ..., 'year': ..., 'url': ...}
# Rows are read as plain tuples in column order; a repeated title overwrites
# the earlier entry.
lectures = {}
for title, grade, required, year, url in df.itertuples(index=False, name=None):
    lectures[title] = {
        'grade': grade,
        'required': required,
        'year': year,
        'url': url,
    }
# Sanity check: show the entry for one known course.
print(lectures['データマイニング'])
{'grade': 3, 'required': False, 'year': 2021, 'url': 'https://r.st.ie.u-ryukyu.ac.jp/assessment/2021a/result/makepage.php?kamoku=i334'}
# CSS selectors for the free-text answer lists, valid for the result page
# whose URL has had '&ex=1' removed.
# Questions covered: Q21-(1), (2), (3), (4) and Q22.
# Each question's <ul> is the first child of a <div> at a fixed position in
# <body>; only that position differs between questions.
selectors = {
    q_id: f'body > div:nth-child({div_index}) > ul:nth-child(1)'
    for q_id, div_index in (
        ('Q21 (1)', 44),
        ('Q21 (2)', 47),
        ('Q21 (3)', 50),
        ('Q21 (4)', 53),
        ('Q22', 56),
    )
}
def get_comments(lectures, selectors, timeout=30):
    """Collect the free-text survey comments for every lecture.

    Each lecture's result page is downloaded once, and for every question
    the ``<li>`` items under the first element matching that question's CSS
    selector are collected.

    Args:
        lectures: Mapping of lecture title to a dict that has at least a
            ``'url'`` key pointing at the lecture's result page.
        selectors: Mapping of question id to the CSS selector of the list
            holding that question's comments.
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            single unresponsive page would block the whole run forever.

    Returns:
        dict: ``{lecture title: {q_id: ['comment 1', 'comment 2', ...]}}``.
        A question id is present only when at least one ``<li>`` was found
        for it; a lecture with no matches maps to an empty dict.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when ``timeout`` is exceeded.
    """
    result = {}
    for lec_name, items in lectures.items():
        r = requests.get(items['url'], timeout=timeout)
        # Decode with the encoding detected from the page content rather
        # than the one declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')

        comments = {}
        for q_id, selector in selectors.items():
            elements = soup.select(selector)
            if not elements:
                # This page has no list at the expected position.
                continue
            # Only the first match is used; trailing whitespace is dropped.
            texts = [li.text.rstrip() for li in elements[0].find_all('li')]
            if texts:
                comments[q_id] = texts
        result[lec_name] = comments
    return result
def comment_to_dataframe(lectures, all_comments):
    """Flatten the collected comments into one row per comment.

    Args:
        lectures: Mapping of lecture title to a dict with ``'grade'`` and
            ``'required'`` entries.
        all_comments: Mapping of lecture title to ``{q_id: [comment, ...]}``;
            it must contain every title present in ``lectures``.

    Returns:
        pd.DataFrame: columns
        ``['title', 'grade', 'required', 'q_id', 'comment']``.
        Empty comments are left out.
    """
    header = ['title', 'grade', 'required', 'q_id', 'comment']

    def _rows():
        # Yield one record per non-empty comment, in lecture/question order.
        for name, info in lectures.items():
            grade, required = info['grade'], info['required']
            for question, texts in all_comments[name].items():
                for text in texts:
                    if len(text) > 0:
                        yield [name, grade, required, question, text]

    return pd.DataFrame(list(_rows()), columns=header)
# Run the pipeline: scrape every lecture page, then flatten the result into
# one row per comment.
all_comments = get_comments(lectures=lectures, selectors=selectors)
df = comment_to_dataframe(lectures=lectures, all_comments=all_comments)
# Notebook cell output: preview the first rows of the comment table.
df.head(5)
title | grade | required | q_id | comment | |
---|---|---|---|---|---|
0 | 工業数学Ⅰ | 1 | True | Q21 (1) | 特になし |
1 | 工業数学Ⅰ | 1 | True | Q21 (2) | 正直わかりずらい。むだに間があるし。 |
2 | 工業数学Ⅰ | 1 | True | Q21 (2) | 例題を取り入れて理解しやすくしてほしい。 |
3 | 工業数学Ⅰ | 1 | True | Q21 (2) | 特になし |
4 | 工業数学Ⅰ | 1 | True | Q21 (2) | スライドに書く文字をもう少しわかりやすくして欲しいです。 |
# Persist the comment table. to_pickle does not create missing directories,
# so make sure './corpus' exists first; otherwise the save fails only after
# all the scraping has already been done.
import os

os.makedirs('./corpus', exist_ok=True)
df.to_pickle('./corpus/r_assesment.pkl')