授業評価アンケートのデータ収集用スクリプト

調査と解析班で収集している、知能情報コース専門科目の2021年度前期科目を対象に、自由記述欄のデータを収集する。

!curl -O https://ie.u-ryukyu.ac.jp/~tnal/2022/dm/static/r_assesment_list.csv
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2455  100  2455    0     0  11135      0 --:--:-- --:--:-- --:--:-- 11418
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Load the lecture list fetched by the curl step above. Column labels are
# supplied via `names`, so the first line of the file is read as data.
source_file = 'r_assesment_list.csv'
assesment_columns = ['title', 'grade', 'required', 'year', 'url']
df = pd.read_csv(source_file, header=None, names=assesment_columns)
df.head()
title grade required year url
0 工業数学Ⅰ 1 True 2021 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
1 技術者の倫理 1 True 2021 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
2 工学基礎演習 1 True 2021 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
3 プログラミングⅠ 1 True 2021 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
4 基礎数学Ⅰ 1 False 2021 https://r.st.ie.u-ryukyu.ac.jp/assessment/2021...
# Index the lecture list by title so that each lecture's metadata
# (grade, required flag, year, result-page URL) can be looked up by name.
lectures = {}
for items in df.itertuples():
    # Position 0 of each row tuple is the DataFrame index; the five
    # columns follow in the order given by `assesment_columns`.
    title, grade, required, year, url = items[1:6]
    lectures[title] = dict(grade=grade, required=required, year=year, url=url)

# Spot-check a single entry.
print(lectures['データマイニング'])
{'grade': 3, 'required': False, 'year': 2021, 'url': 'https://r.st.ie.u-ryukyu.ac.jp/assessment/2021a/result/makepage.php?kamoku=i334'}
# CSS selectors that apply to a result page whose URL has had '&ex=1'
# removed. One selector per free-text question:
# Q21-(1), (2), (3), (4) and Q22.
selectors = {
    q_id: f'body > div:nth-child({div_position}) > ul:nth-child(1)'
    for q_id, div_position in (
        ('Q21 (1)', 44),
        ('Q21 (2)', 47),
        ('Q21 (3)', 50),
        ('Q21 (4)', 53),
        ('Q22', 56),
    )
}
def get_comments(lectures, selectors, timeout=30):
    """Collect the free-text survey comments of every lecture.

    Each lecture's result page is downloaded and, for every question,
    the first element matched by that question's CSS selector is
    searched for <li> items; the text of each item is one comment.

    Args:
        lectures: Mapping of lecture title to a dict that contains at
            least a 'url' entry (the lecture's result page).
        selectors: Mapping of question id to a CSS selector string.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            single unresponsive server would block the whole run
            indefinitely.

    Returns:
        dict: ``{lecture title: {q_id: [comment, ...]}}``. A q_id appears
        only when its selector matched and at least one <li> was found.
        Trailing whitespace is stripped from every comment.

    Raises:
        requests.exceptions.RequestException: If a page cannot be
            fetched (including a timeout). HTTP error statuses are not
            checked; such a response is parsed like any other page.
    """
    result = {}
    for lec_name, items in lectures.items():
        response = requests.get(items['url'], timeout=timeout)
        # Decode with the encoding guessed from the body instead of the
        # one requests derived from the HTTP headers.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        comments = {}
        for q_id, selector in selectors.items():
            elements = soup.select(selector)
            if not elements:
                continue
            texts = [li.text.rstrip() for li in elements[0].find_all('li')]
            # Keep the q_id absent (rather than mapping it to an empty
            # list) when the matched element holds no <li> items.
            if texts:
                comments[q_id] = texts
        result[lec_name] = comments
    return result

def comment_to_dataframe(lectures, all_comments):
    """Flatten the nested comment mapping into one row per comment.

    Args:
        lectures: Mapping of lecture title to a dict with 'grade' and
            'required' entries.
        all_comments: Mapping of lecture title to ``{q_id: [comment, ...]}``;
            it must contain every title found in ``lectures``.

    Returns:
        pd.DataFrame with columns
        ['title', 'grade', 'required', 'q_id', 'comment'].
        Zero-length comments are skipped.
    """
    header = ['title', 'grade', 'required', 'q_id', 'comment']
    rows = []
    for name, info in lectures.items():
        grade, required = info['grade'], info['required']
        rows.extend(
            [name, grade, required, question, text]
            for question, texts in all_comments[name].items()
            for text in texts
            if len(text) > 0
        )
    return pd.DataFrame(rows, columns=header)


# Scrape every lecture's result page, then flatten the nested result into
# one row per comment. Note that `df` is rebound here: from this point on
# it holds the comment table, not the lecture list.
all_comments = get_comments(lectures=lectures, selectors=selectors)
df = comment_to_dataframe(lectures=lectures, all_comments=all_comments)
df.head()
title grade required q_id comment
0 工業数学Ⅰ 1 True Q21 (1) 特になし
1 工業数学Ⅰ 1 True Q21 (2) 正直わかりずらい。むだに間があるし。
2 工業数学Ⅰ 1 True Q21 (2) 例題を取り入れて理解しやすくしてほしい。
3 工業数学Ⅰ 1 True Q21 (2) 特になし
4 工業数学Ⅰ 1 True Q21 (2) スライドに書く文字をもう少しわかりやすくして欲しいです。
import os

# Persist the comment table. The target directory is created first so the
# write does not fail on a fresh checkout where ./corpus does not exist.
output_file = './corpus/r_assesment.pkl'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_pickle(output_file)