from __future__ import annotations

import asyncio
import hashlib
import json
import math
import pickle
import re
from asyncio import create_task
from dataclasses import dataclass
from datetime import timedelta
from decimal import Decimal
from enum import Enum
from inspect import isawaitable
from logging import Logger, getLogger
from pathlib import Path
from typing import (
    Any,
    AsyncIterable,
    Awaitable,
    Callable,
    Iterable,
    NamedTuple,
    Optional,
    TypeVar,
    Union,
)

import aiofiles
import aiohttp
from bs4 import BeautifulSoup, ResultSet, Tag
from pandas import DataFrame
from tenacity import WrappedFn, retry
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_attempt, stop_after_delay
from tenacity.wait import wait_exponential
from tqdm import tqdm

from ut_course_catalog.common import BASE_URL, Semester, Weekday

from .common import Language, RateLimitter
from .pandas import to_dataframe
class Institution(Enum):
    """Institution in the University of Tokyo.

    Each member's value is the string sent as the ``type`` query parameter
    of a search request.
    """

    学部前期課程 = "jd"
    """Junior Division"""
    学部後期課程 = "ug"
    """Senior Division"""
    大学院 = "g"
    """Graduate"""
    # Default value of ``SearchParams.課程``.
    All = "all"
class Faculty(Enum):
    """Faculty in the University of Tokyo.

    Each member's value is the integer sent as the ``faculty_id`` query
    parameter of a search request.
    """

    法学部 = 1
    医学部 = 2
    工学部 = 3
    文学部 = 4
    理学部 = 5
    農学部 = 6
    経済学部 = 7
    教養学部 = 8
    教育学部 = 9
    薬学部 = 10
    人文社会系研究科 = 11
    教育学研究科 = 12
    法学政治学研究科 = 13
    経済学研究科 = 14
    総合文化研究科 = 15
    理学系研究科 = 16
    工学系研究科 = 17
    農学生命科学研究科 = 18
    医学系研究科 = 19
    薬学系研究科 = 20
    数理科学研究科 = 21
    新領域創成科学研究科 = 22
    情報理工学系研究科 = 23
    学際情報学府 = 24
    公共政策学教育部 = 25
    教養学部前期課程 = 26

    @classmethod
    def value_of(cls, value) -> "Faculty":
        """Convert a faculty label used on the website to a ``Faculty`` member.

        Parameters
        ----------
        value
            A member name, or the special label ``"教養学部(前期課程)"``.

        Returns
        -------
        Faculty
            The matching member.

        Raises
        ------
        ValueError
            If ``value`` matches neither a member name nor the special label.
        """
        for name, member in cls.__members__.items():
            if name == value:
                return member
        # The website writes this one faculty with parentheses, which cannot
        # appear in a Python identifier.
        if value == "教養学部(前期課程)":
            return cls.教養学部前期課程
        raise ValueError(f"'{cls.__name__}' enum not found for '{value}'")
class CommonCode(str):
    """A course common code with accessors that decode its positional fields.

    Positions read by the properties below:

    * ``[0]``    institution letter
    * ``[1:3]``  faculty code
    * ``[4:6]``  department code
    * ``[6]``    level
    * ``[7:10]`` reference number
    * ``[10]``   class form letter
    * ``[11]``   language digit

    Properties that decode a single position return ``None`` when the code is
    too short or the character is not a known value.
    """

    @property
    def institution(self) -> Optional[Institution]:
        """Institution decoded from the first character, or ``None`` if unknown."""
        try:
            return {
                "C": Institution.学部前期課程,
                "F": Institution.学部後期課程,
                "G": Institution.大学院,
            }[self[0]]
        except (IndexError, KeyError):
            return None

    @property
    def faculty(self) -> Faculty:
        """Faculty decoded from characters 1-2.

        Raises
        ------
        RuntimeWarning
            If the two-letter code is found in neither lookup table.
        """
        code = self[1:3]
        g_faculties = {
            "HS": Faculty.人文社会系研究科,
            "LP": Faculty.法学政治学研究科,
            "AS": Faculty.総合文化研究科,
            "SC": Faculty.理学系研究科,
            "EN": Faculty.工学系研究科,
            "AG": Faculty.農学生命科学研究科,
            "ME": Faculty.医学系研究科,
            "PH": Faculty.薬学系研究科,
            "MA": Faculty.数理科学研究科,
            "FS": Faculty.新領域創成科学研究科,
            "IF": Faculty.情報理工学系研究科,
            "II": Faculty.学際情報学府,
            "PP": Faculty.公共政策学教育部,
        }
        ug_faculties = {
            "LA": Faculty.法学部,
            "ME": Faculty.医学部,
            "EN": Faculty.工学部,
            "LE": Faculty.文学部,
            "SC": Faculty.理学部,
            "AG": Faculty.農学部,
            "EC": Faculty.経済学部,
            "AS": Faculty.教養学部,
            "ED": Faculty.教育学部,
            "PH": Faculty.薬学部,
        }
        institution = self.institution
        if institution == Institution.学部前期課程 and code == "AS":
            return Faculty.教養学部前期課程
        # Several codes exist in both tables; the institution decides which
        # table is consulted first, the other one serves as a fallback.
        if institution == Institution.大学院:
            lookup_order = (g_faculties, ug_faculties)
        else:
            lookup_order = (ug_faculties, g_faculties)
        for table in lookup_order:
            if code in table:
                return table[code]
        raise RuntimeWarning(f"Unknown faculty code: {code}")

    @property
    def department_code(self) -> str:
        """Characters 4-5 (empty or shorter when the code is too short)."""
        return self[4:6]

    @property
    def level(self) -> Optional[str]:
        """Character 6, or ``None`` when the code is too short."""
        try:
            return self[6]
        except IndexError:
            return None

    @property
    def reference_number(self) -> str:
        """Characters 7-9 (empty or shorter when the code is too short)."""
        return self[7:10]

    @property
    def class_form(self) -> Optional[ClassForm]:
        """Class form decoded from character 10, or ``None`` if unknown."""
        try:
            return {
                "L": ClassForm.講義,
                "S": ClassForm.演習,
                "E": ClassForm.実験,
                "P": ClassForm.実習,
                "T": ClassForm.卒業論文,
                "Z": ClassForm.その他,
            }[self[10]]
        # NOTE(review): ``ClassForm`` is not imported anywhere in the visible
        # part of this module, so this lookup raises NameError and the
        # property always yields None. NameError is caught to keep that
        # behaviour; import ClassForm (presumably from ``.common`` -- confirm)
        # and drop NameError from this clause to make the property work.
        except (IndexError, KeyError, NameError):
            return None

    @property
    def language(self) -> Optional[Language]:
        """Lecture language decoded from the digit at position 11, or ``None``."""
        try:
            return {
                1: Language.Japanese,
                2: Language.JapaneseAndEnglish,
                3: Language.English,
                4: Language.OtherLanguagesToo,
                5: Language.OnlyOtherLanguages,
                9: Language.Others,
            }[int(self[11])]
        except (IndexError, KeyError, ValueError):
            return None

    @property
    def department_name(self) -> str:
        """Department name looked up from the faculty and department code."""
        return CommonCode.parse_department(self.faculty, self.department_code)

    @property
    def small_category(self) -> Optional[str]:
        """Characters 1-2 of the reference number, or ``None`` if it is empty."""
        reference_number = self.reference_number
        if reference_number:
            return reference_number[1:3]
        return None

    @property
    def middle_category(self) -> Optional[str]:
        """First character of the reference number, or ``None`` if it is empty."""
        reference_number = self.reference_number
        if reference_number:
            return reference_number[0]
        return None

    @property
    def large_category(self) -> str:
        """Alias of ``department_code``."""
        return self.department_code

    def _asdict(self) -> dict[str, Any]:
        """Return all decoded fields keyed by Japanese labels.

        Propagates the ``RuntimeWarning`` raised by ``faculty`` for unknown
        faculty codes.
        """
        return {
            "課程": self.institution,
            "学部": self.faculty,
            "学科": self.department_name,
            "学科コード": self.department_code,
            "レベル": self.level,
            "整理番号": self.reference_number,
            "授業形態": self.class_form,
            "講義使用言語": self.language,
            "小分類": self.small_category,
            "中分類": self.middle_category,
            "大分類": self.large_category,
        }

    def _asdict_en(self) -> dict[str, Any]:
        """Return all decoded fields keyed by English labels.

        Propagates the ``RuntimeWarning`` raised by ``faculty`` for unknown
        faculty codes.
        """
        return {
            "institution": self.institution,
            "faculty": self.faculty,
            "department_code": self.department_code,
            "level": self.level,
            "reference_number": self.reference_number,
            "class_form": self.class_form,
            "language": self.language,
            "department_name": self.department_name,
            "large_category": self.large_category,
            "middle_category": self.middle_category,
            "small_category": self.small_category,
        }

    @staticmethod
    def parse_department(faculty: Faculty, department_code: str) -> str:
        """Translate a department code into its name for the given faculty.

        Parameters
        ----------
        faculty : Faculty
            Faculty whose department table is consulted.
        department_code : str
            Two-letter department code.

        Returns
        -------
        str
            The department name, or ``department_code`` itself when the
            faculty's table has no entry for it.
        """
        d = {
            Faculty.教養学部前期課程: {
                "FC": "基礎科目",
                "IC": "展開科目",
                "GC": "総合科目",
                "TC": "主題科目",
                "PF": "基礎科目(PEAK)",
                "PI": "展開科目(PEAK)",
                "PG": "総合科目(PEAK)",
                "PT": "主題科目(PEAK)",
            },
            Faculty.法学部: {
                "CO": "共通科目",
                "PL": "実定法系科目",
                "BL": "基礎法学系科目",
                "PS": "政治系科目",
                "EC": "経済系科目",
                "SE": "演習科目",
            },
            Faculty.医学部: {"ME": "医学科", "IE": "健康総合科学科"},
            Faculty.工学部: {
                "CO": "共通科目",
                "JL": "日本語教育部門",
                "CE": "社会基盤学科",
                "AR": "建築学科",
                "UE": "都市工学科",
                "MX": "機械系",
                "ME": "機械工学科",
                "MI": "機械情報工学科",
                "AA": "航空宇宙工学科",
                "PE": "精密工学科",
                "EE": "電子・情報系",
                "AM": "応用物理系",
                "AP": "物理工学科",
                "MP": "計数工学科",
                "MA": "マテリアル工学科",
                "CH": "化学・生命系",
                "CA": "応用化学科",
                "CS": "化学システム工学科",
                "CB": "化学生命工学科",
                "SI": "システム創成学科",
                "SA": "環境・エネルギーシステムコース",
                "SB": "システムデザイン&マネジメントコース",
                "SC": "知能社会システムコース",
            },
            Faculty.文学部: {
                "HU": "人文学科",
                "XX": "専修課程以外",
            },
            Faculty.理学部: {
                "MA": "数学科",
                "IS": "情報科学科",
                "PH": "物理学科",
                "AS": "天文学科",
                "EP": "地球惑星物理学科",
                "EE": "地球惑星環境学科",
                "CH": "化学科",
                "BC": "生物化学科",
                "BS": "生物学科",
                "BI": "生物情報科学科",
                "CC": "理学部共通科目",
            },
            Faculty.農学部: {
                "MC": "生命化学・工学専修",
                "MB": "応用生物学専修",
                "MF": "森林生物科学専修/森林環境資源科学専修",
                "MQ": "水圏生物科学専修",
                "MA": "動物生命システム科学専修",
                "MM": "生物素材科学専修",
                "ML": "緑地環境学専修",
                "MW": "木質構造科学専修",
                "MG": "生物・環境工学専修",
                "ME": "農業・資源経済学専修",
                "MS": "フィールド科学専修",
                "MI": "国際開発農学専修",
                "MV": "獣医学専修",
                "CC": "共通",
                "CL": "応用生命科学課程",
                "CE": "環境資源学課程",
                "CV": "獣医学専修",
            },
            Faculty.経済学部: {
                "EC": "経済学",
                "ST": "統計学",
                "AS": "地域研究",
                "EH": "経済史",
                "MA": "経営学",
                "QF": "数量ファイナンス",
                "WW": "その他",
            },
            Faculty.教養学部: {
                "AA": "言語共通科目",
                "BA": "言語専門科目",
                "CA": "教養学科",
                "DA": "学際科学科",
                "EA": "統合自然科学科",
                "FA": "学融合プログラム",
                "GA": "教職科目",
                "HA": "特設科目",
                "XA": "高度教養科目",
            },
            Faculty.教育学部: {
                "IE": "総合教育科学科",
                "BT": "基礎教育学コース",
                "SS": "教育社会科学専修",
                "SO": "比較教育社会学コース",
                "PP": "教育実践・政策学コース",
                "DS": "心身発達科学専修",
                "EP": "教育心理学コース",
                "PH": "身体教育学コース",
            },
            Faculty.薬学部: {
                "SH": "薬科学科/薬学科",
                "PS": "薬科学科",
                "PH": "薬学科",
            },
            Faculty.理学系研究科: {
                "PH": "物理学専攻",
                "AS": "天文学専攻",
                "EP": "地球惑星科学専攻",
                "EE": "地球惑星環境学科",
                "CH": "化学専攻",
                "BC": "生物化学科",
                "BS": "生物科学専攻",
                "BI": "生物情報科学科",
                "CC": "理学部共通科目",
            },
            Faculty.教育学研究科: {
                "IE": "総合教育科学専攻",
                "AS": "学校教育高度化専攻",
                "ZZ": "その他",
            },
            Faculty.人文社会系研究科: {
                "GC": "基礎文化研究専攻",
                "JS": "日本文化研究専攻",
                "EA": "欧米系文化研究専攻",
                "AS": "アジア文化研究専攻",
                "SC": "社会文化研究専攻",
                "CR": "文化資源学研究専攻",
                "KS": "韓国朝鮮文化研究専攻",
                "XX": "共通科目",
            },
            Faculty.法学政治学研究科: {
                "LP": "総合法政専攻",
                "LS": "法曹養成専攻",
            },
            Faculty.経済学研究科: {
                "EC": "経済学研究科",
            },
            Faculty.総合文化研究科: {
                "LI": "言語情報科学専攻",
                "IC": "超域文化科学専攻",
                "AS": "地域文化研究専攻",
                "SI": "国際社会科学専攻",
                "LS": "広域科学専攻 生命環境科学系",
                "SS": "広域科学専攻 広域システム科学系",
                "BS": "広域科学専攻 相関基礎科学系",
                "HS": "「人間の安全保障」プログラム",
                "EU": "欧州研究プログラム",
                "GH": "グローバル共生プログラム",
                "IH": "多文化共生・統合人間学プログラム",
                "GS": "国際人材養成プログラム",
                "ES": "国際環境学プログラム",
                "GW": "グローバル・スタディーズ・イニシアティヴ国際卓越大学院",
                "WA": "先進基礎科学推進国際卓越大学院",
                "IT": "科学技術インタープリター養成プログラム",
                "IG": "日独共同大学院プログラム",
                "EE": "英語教育プログラム",
            },
            # Placeholder entry only: every real code falls through to the
            # ``.get`` default below and is returned unchanged.
            Faculty.工学系研究科: {"": ""},
            Faculty.農学生命科学研究科: {
                "CC": "共通",
                "AB": "生産・環境生物学",
                "AC": "応用生命化学",
                "BT": "応用生命工学",
                "FS": "森林科学",
                "AQ": "水圏生物科学",
                "AE": "農業・資源経済学",
                "BE": "生物・環境工学",
                "BM": "生物材料科学",
                "WA": "生物材料科学・木造建築コース",
                "GA": "農学国際",
                "IP": "農学国際・国際農業開発学コース",
                "ES": "生圏システム学",
                "AS": "応用動物科学",
                "VM": "獣医学",
                "MS": "副専攻",
            },
            Faculty.医学系研究科: {
                "MC": "分子細胞生物学",
                "FB": "機能生物学",
                "PA": "病因・病理学",
                "RB": "生体物理医学",
                "NS": "脳神経医学",
                "SM": "社会医学",
                "IM": "内科学",
                "RE": "生殖・発達・加齢医学",
                "SS": "外科学",
                "HN": "健康科学・看護学",
                "PN": "健康科学・看護学 保健師コース",
                "NU": "健康科学・看護学 専門看護師コース",
                "PE": "健康科学・看護学 保健師教育コース",
                "MW": "健康科学・看護学 助産師教育コース",
                "IH": "国際保健学",
                "MH": "医科学",
                "PH": "公共健康医学",
                "ML": "医学共通科目",
                "GP": "医学共通科目(がんプロフェショナル養成プラン)",
                "PL": "GPLLI(リーディング大学院)",
                "LS": "生命科学技術国際卓越大学院(ライフサイエンスコース)",
                "BE": "生命科学技術国際卓越大学院(生体医工学コース)",
            },
            Faculty.薬学系研究科: {
                "SH": "薬科学専攻/薬学専攻",
                "PS": "薬科学専攻",
                "PH": "薬学専攻",
                "WL": "生命科学技術国際卓越大学院 WINGS-LST",
            },
            Faculty.数理科学研究科: {"MA": "数理科学研究科"},
            Faculty.情報理工学系研究科: {
                "CS": "コンピュータ科学",
                "MA": "数理情報学",
                "IP": "システム情報学",
                "IC": "電子情報学",
                "MX": "知能機械情報学",
                "CI": "創造情報学",
                "CO": "共通科目",
            },
            Faculty.新領域創成科学研究科: {
                "OC": "全学開放科目",
                "CC": "新領域創成科学研究科共通科目",
                "EC": "環境学研究系共通科目",
                "AM": "物質系専攻",
                "AE": "先端エネルギー工学専攻",
                "CS": "複雑理工学専攻",
                "IB": "先端生命科学専攻",
                "MJ": "メディカル情報生命専攻",
                "NE": "自然環境学専攻",
                "OT": "海洋技術環境学専攻",
                "ES": "環境システム学専攻",
                "HE": "人間環境学専攻",
                "SC": "社会文化環境学専攻",
                "IS": "国際協力学専攻",
                "SS": "サステイナビリティ学グローバルリーダー養成大学院プログラム",
            },
            Faculty.学際情報学府: {
                "SC": "社会情報学コース",
                "CH": "文化・人間情報学コース",
                "ED": "先端表現情報学コース",
                "AC": "総合分析情報学コース",
                "IA": "アジア情報社会コース",
                "BS": "生物統計情報学コース",
                "RS": "学際情報学専攻(必修)",
                "CS": "学際情報学専攻(共通)",
                "WS": "学際情報学専攻(横断)",
            },
            Faculty.公共政策学教育部: {
                "DP": "国際公共政策学専攻",
                "MP": "公共政策学専攻",
            },
        }
        return d[faculty].get(department_code, department_code)
class SearchResultItem(NamedTuple):
    """Summary of a course in search results. Call `fetch_details` to get more information."""

    # Timetable code: first value read from the card's code cell.
    時間割コード: str
    # Common code: second value read from the card's code cell.
    共通科目コード: CommonCode
    # Course name ("name" cell).
    コース名: str
    # Lecturer ("lecturer" cell).
    教員: str
    # Semesters, one per semester icon found in the "semester" cell.
    学期: set[Semester]
    # (weekday, period) pairs parsed from the "period" cell.
    曜限: set[tuple[Weekday, int]]
    # Text of the card body, stripped of leading/trailing whitespace.
    ねらい: str
class SearchResult(NamedTuple):
    """Result of a search query (one page)."""

    # Items parsed from the current page.
    items: list[SearchResultItem]
    # First and last item index reported by the page-info text.
    current_items_first_index: int
    current_items_last_index: int
    # last index - first index + 1.
    current_items_count: int
    # Total number of matches reported by the page-info text.
    total_items_count: int
    # Requested page number (0 when the search returned nothing).
    current_page: int
    # ceil(total_items_count / 10).
    total_pages: int
class Details(NamedTuple):
    """Details of a course. Contains all available information for a course on the website. (UTAS may have more information)"""

    # --- Same fields as SearchResultItem ---
    時間割コード: str
    共通科目コード: CommonCode
    コース名: str
    教員: str
    学期: set[Semester]
    曜限: set[tuple[Weekday, int]]
    ねらい: str
    # --- Cells of the first card on the detail page, in page order ---
    教室: str
    単位数: Decimal
    # True unless the cell text contains "不可".
    他学部履修可: bool
    講義使用言語: str
    # True when the cell text contains "YES".
    実務経験のある教員による授業科目: bool
    開講所属: Faculty
    # --- Optional cards, looked up by their header text; None when absent ---
    授業計画: Optional[str]
    授業の方法: Optional[str]
    成績評価方法: Optional[str]
    教科書: Optional[str]
    参考書: Optional[str]
    履修上の注意: Optional[str]
# Element type of a search facet value.
T = TypeVar("T")
# Either a single value or an iterable of such values.
IterableOrType = Union[Iterable[T], T]
# Same as IterableOrType, with None allowed (SearchParams uses None as the
# default for every facet field).
OptionalIterableOrType = Optional[IterableOrType[T]]
@dataclass
class SearchParams:
    """Search query parameters.

    Every facet field accepts either a single value or an iterable of values;
    ``None`` leaves the facet out of the query.
    """

    keyword: Optional[str] = None
    課程: Institution = Institution.All
    開講所属: Optional[Faculty] = None
    学年: OptionalIterableOrType[int] = None
    """AND search, not OR."""
    学期: OptionalIterableOrType[Semester] = None
    """AND search, not OR."""
    曜日: OptionalIterableOrType[Weekday] = None
    """AND search, not OR. Few courses have multiple periods."""
    時限: OptionalIterableOrType[int] = None
    """AND search, not OR. Few courses have multiple periods."""
    講義使用言語: OptionalIterableOrType[str] = None
    """AND search, not OR."""
    横断型教育プログラム: OptionalIterableOrType[str] = None
    """AND search, not OR."""
    実務経験のある教員による授業科目: OptionalIterableOrType[bool] = None
    """AND search, not OR. Do not specify [True, False] though it is valid."""
    分野_NDC: OptionalIterableOrType[str] = None
    """AND search, not OR."""

    def id(self) -> str:
        """Return the SHA-256 hex digest of this object's ``str()`` form.

        ``UTCourseCatalog.get_filepath`` uses it as the default file name.
        """
        return hashlib.sha256(str(self).encode()).hexdigest()
def _format(text: str) -> str:
"""Utility function for removing unnecessary whitespaces."""
table = str.maketrans(" ", " ", " \n\r\t")
return text.translate(table)
def _format_description(text: str) -> str:
# delete spaces at first and last
text = re.sub(r"^\s+", "", text)
text = re.sub(r"\s+$", "", text)
# table = str.maketrans("", "", "\r\n\t")
# text = text.translate(table)
return text
def _ensure_found(obj: object) -> Tag:
    """Return ``obj`` if it is exactly a ``Tag``; raise ``ParserError`` otherwise.

    The check is on the exact type, so ``None``, strings and ``Tag``
    subclasses are all rejected.
    """
    if type(obj) is Tag:
        return obj
    raise ParserError(f"{obj} not found")
def _parse_weekday_period(period_text: str) -> set[tuple[Weekday, int]]:
    """Parse a period description such as "月曜3限、木曜3限" into (weekday, period) pairs.

    Parameters
    ----------
    period_text : str
        Raw text of a period cell; whitespace is removed before parsing.

    Returns
    -------
    set[tuple[Weekday, int]]
        One pair per "、"-separated entry that contains both a weekday
        character and a number. An empty set is returned when the text
        contains ":" or "集中", or when any entry is empty.
    """
    period_text = _format(period_text)
    # Texts like "S1: 集中、A1: 月曜3限 他" (per-semester schedules) and
    # anything mentioning "集中" are deliberately not parsed.
    if ":" in period_text or "集中" in period_text:
        return set()

    def parse_one(period: str) -> Optional[tuple[Weekday, int]]:
        # The position of the weekday character in this string is the value
        # passed to Weekday().
        weekday_index = next(
            (i for i, name in enumerate("月火水木金土日") if name in period),
            None,
        )
        number = re.search(r"\d+", period)
        if weekday_index is None or number is None:
            # Unparseable entry: skip it instead of raising ValueError
            # (no weekday) or polluting the result with None (no number).
            return None
        return Weekday(weekday_index), int(number.group())

    result: set[tuple[Weekday, int]] = set()
    for item in period_text.split("、"):
        if not item:
            return set()
        parsed = parse_one(item)
        if parsed is not None:
            result.add(parsed)
    return result
async def _await_if_future(obj: object) -> object:
if isawaitable(obj):
return await obj
return obj
class ParserError(Exception):
    """Raised when a page does not have the structure the parser expects."""
class UTCourseCatalog:
    """A parser for the [UTokyo Online Course Catalogue](https://catalog.he.u-tokyo.ac.jp).

    Must be used as an async context manager: the HTTP session is created in
    ``__aenter__`` and every fetch method raises ``RuntimeError`` without it.
    """

    session: Optional[aiohttp.ClientSession]
    _logger: Logger
    _rate_limitter: RateLimitter

    def __init__(
        self, logger_level: int = 0, min_interval: Union[timedelta, int] = 1
    ) -> None:
        """Create a catalog client.

        Parameters
        ----------
        logger_level : int, optional
            Level set on this module's logger, by default 0.
        min_interval : Union[timedelta, int], optional
            Passed to ``RateLimitter`` as ``min_interval``, by default 1.
        """
        self.session = None
        self._logger = getLogger(__name__)
        self._logger.setLevel(logger_level)
        self._rate_limitter = RateLimitter(min_interval=min_interval)

    async def __aenter__(self):
        if self.session:
            raise RuntimeError("__aenter__ called twice")
        self.session = aiohttp.ClientSession()
        await self.session.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        self._check_client()
        assert self.session
        try:
            await self.session.__aexit__(exc_type, exc, tb)
        finally:
            # Forget the closed session so the instance can be entered again
            # instead of failing with "__aenter__ called twice".
            self.session = None

    def _check_client(self):
        """Raise ``RuntimeError`` unless a session has been opened."""
        if not self.session:
            raise RuntimeError("__aenter__ not called")

    @staticmethod
    def _build_facet(params: SearchParams) -> dict[str, list[str]]:
        """Build the ``facet`` part of a search query from ``params``.

        Returns a mapping of facet name to a list of stringified codes;
        facets that are unset are left out.
        """

        def as_list(x: IterableOrType[T]) -> list[T]:
            # A str is iterable, but here it denotes one value; iterating it
            # would split e.g. a language code into single characters.
            if isinstance(x, (str, bytes)) or not isinstance(x, Iterable):
                return [x]
            return list(x)

        facet: dict[str, list] = {}
        if params.横断型教育プログラム:
            facet["uwide_cross_program_codes"] = as_list(params.横断型教育プログラム)
        if params.学年:
            facet["grades_codes"] = as_list(params.学年)
        if params.学期:
            facet["semester_codes"] = [s.value for s in as_list(params.学期)]
        if params.時限:
            facet["period_codes"] = [x - 1 for x in as_list(params.時限)]
        if params.曜日 is not None:
            facet["wday_codes"] = [
                x.value * 100 + 1000 for x in as_list(params.曜日)
            ]
        if params.講義使用言語:
            facet["course_language_codes"] = as_list(params.講義使用言語)
        # ``False`` is a meaningful value for this facet, so test against
        # None rather than truthiness; an empty iterable is still skipped.
        if params.実務経験のある教員による授業科目 is not None:
            flags = as_list(params.実務経験のある教員による授業科目)
            if flags:
                facet["operational_experience_flag"] = flags
        if params.分野_NDC:
            # The key really is singular "subject_code": the inconsistency
            # is in the API, not a typo here.
            facet["subject_code"] = as_list(params.分野_NDC)
        return {k: [str(x) for x in v] for k, v in facet.items()}

    @staticmethod
    def _parse_codes(cells_parent: Tag) -> tuple[str, CommonCode]:
        """Read (timetable code, common code) from the ``code-cell`` of a row.

        Raises ``ParserError`` when the cell is missing or has fewer children
        than expected.
        """
        code_cell = _ensure_found(cells_parent.find(class_="code-cell"))
        children = list(code_cell.children)
        if len(children) < 4:
            raise ParserError("code cell has an unexpected structure")
        return children[1].text, CommonCode(children[3].text)

    async def fetch_search(self, params: SearchParams, page: int = 1) -> SearchResult:
        """Fetch search results from the website.

        Parameters
        ----------
        params : SearchParams
            Search parameters.
        page : int, optional
            page number, by default 1

        Returns
        -------
        SearchResult
            Search results. All counters are 0 and ``items`` is empty when
            the page contains no result summary.

        Raises
        ------
        RuntimeError
            Raises when called outside of ``async with``.
        ParserError
            Raises when failed to parse the website.
        """
        self._check_client()
        assert self.session
        # See: https://github.com/34j/ut-course-catalog-swagger/blob/master/swagger.yaml
        # build query
        _params: dict[str, Any] = {
            "type": params.課程.value,
            "page": page,
        }
        if params.keyword:
            _params["q"] = params.keyword
        if params.開講所属:
            _params["faculty_id"] = params.開講所属.value
        facet = self._build_facet(params)
        if facet:
            # Compact JSON; json.dumps escapes quotes and keeps spaces inside
            # values intact.
            _params["facet"] = json.dumps(
                facet, ensure_ascii=False, separators=(",", ":")
            )

        # fetch website
        await self._rate_limitter.wait()
        async with self.session.get(BASE_URL + "result", params=_params) as response:
            html = await response.text()
        soup = BeautifulSoup(html, "html.parser")

        # get page info first
        page_info_element = soup.find(class_="catalog-total-search-result")
        if not page_info_element:
            # not found
            return SearchResult(
                items=[],
                current_items_count=0,
                total_items_count=0,
                current_items_first_index=0,
                current_items_last_index=0,
                current_page=0,
                total_pages=0,
            )
        page_info_text = _format(page_info_element.text)
        page_info_match: list[str] = re.findall(r"\d+", page_info_text)
        if len(page_info_match) < 3:
            raise ParserError(f"unexpected page info: {page_info_text}")
        current_items_first_index = int(page_info_match[0])
        current_items_last_index = int(page_info_match[1])
        current_items_count = current_items_last_index - current_items_first_index + 1
        total_items_count = int(page_info_match[2])
        total_pages = math.ceil(total_items_count / 10)

        def get_items() -> Iterable[SearchResultItem]:
            """Get search result items."""
            container = soup.find("div", class_="catalog-search-result-card-container")
            if container is None:
                return
            if type(container) is not Tag:
                raise ParserError(f"container not found: {container}")
            cards = container.find_all("div", class_="catalog-search-result-card")
            for card in cards:
                rows = card.find_all(class_="catalog-search-result-table-row")
                if len(rows) < 2:
                    raise ParserError("table row not found in search result card")
                # The second row holds the values (the code reads cells
                # only from it).
                cells_parent: Tag = rows[1]
                if not cells_parent:
                    continue

                def get_cell(name: str) -> Tag:
                    cell = cells_parent.find("div", class_=f"{name}-cell")
                    if type(cell) is not Tag:
                        raise ParserError(f"cell not found: {name}")
                    return cell

                def get_cell_text(name: str) -> str:
                    return _format(get_cell(name).text)

                timetable_code, common_code = self._parse_codes(cells_parent)
                aim = _ensure_found(
                    card.find(class_="catalog-search-result-card-body-text")
                )
                yield SearchResultItem(
                    ねらい=_format_description(aim.text),
                    時間割コード=timetable_code,
                    共通科目コード=common_code,
                    コース名=get_cell_text("name"),
                    教員=get_cell_text("lecturer"),
                    学期={
                        Semester(el.text.replace(" ", "").replace("\n", ""))
                        for el in get_cell("semester").find_all(
                            class_="catalog-semester-icon"
                        )
                    },
                    曜限=_parse_weekday_period(get_cell_text("period")),
                )

        # Parse once and reuse the list for both validation and the result.
        items = list(get_items())
        # Sanity checks: every page except the last one must be full, and
        # the parsed items must agree with the page-info text.
        if page != total_pages:
            if len(items) != 10:
                raise ParserError("items count is not 10")
        if len(items) != current_items_count:
            raise ParserError("items count is not current_items_count")
        if page != current_items_first_index // 10 + 1:
            raise ParserError("page number is not correct")
        return SearchResult(
            items=items,
            total_items_count=total_items_count,
            current_items_first_index=current_items_first_index,
            current_items_last_index=current_items_last_index,
            current_items_count=current_items_count,
            total_pages=total_pages,
            current_page=page,
        )

    async def fetch_detail(self, code: str, year: int = 2022) -> Details:
        """Fetch details of a course.

        Parameters
        ----------
        code : str
            Course code sent as the ``code`` query parameter.
            ``fetch_search_detail_all`` passes the timetable code (時間割コード).
        year : int, optional
            Year of the course, by default 2022.

        Returns
        -------
        Details
            Details of the course.

        Raises
        ------
        RuntimeError
            Raises when called outside of ``async with``.
        ParserError
            Raises when the parser fails to parse the website.
        """
        self._check_client()
        assert self.session
        await self._rate_limitter.wait()
        async with self.session.get(
            BASE_URL + "detail", params={"code": code, "year": str(year)}
        ) as response:
            html = await response.text()

        # We get information from 3 different types of elements:
        #   cells 1: cells in the smallest table in the page.
        #   cells 2: cells in the first card.
        #   cards: cards.
        soup = BeautifulSoup(html, "html.parser")

        # utility functions to get elements and their text
        rows = soup.find_all(class_="catalog-row")
        if len(rows) < 2:
            raise ParserError("catalog-row not found")
        cells1_parent: Tag = rows[1]

        def get_cell1(name: str) -> str:
            class_ = f"{name}-cell"
            cell = cells1_parent.find("div", class_=class_)
            if not cell:
                raise ParserError(f"Cell {name} not found")
            return _format(cell.text)

        def get_cell2(index: int) -> str:
            # Cells are laid out three per row; rows use the classes
            # td1-cell, td2-cell, ...
            class_ = f"td{index // 3 + 1}-cell"
            cells = soup.find_all(class_=class_)
            if len(cells) <= index % 3:
                raise ParserError(f"Cell {class_}[{index % 3}] not found")
            return _format(cells[index % 3].text)

        def get_cards():
            cards: ResultSet[Tag] = soup.find_all(class_="catalog-page-detail-card")
            for card in cards:
                card_header = card.find(class_="catalog-page-detail-card-header")
                if not card_header:
                    raise ParserError("Card header not found")
                title = _format(card_header.text)
                card_body = card.find(class_="catalog-page-detail-card-body-pre")
                if not card_body:
                    raise ParserError("card_body not found")
                if type(card_body) is not Tag:
                    raise ParserError("card_body is not Tag")
                yield title, card_body

        cards = dict(get_cards())

        def get_card_text(name: str) -> Optional[str]:
            card = cards.get(name, None)
            if card:
                return _format_description(card.text)
            return None

        timetable_code, common_code = self._parse_codes(cells1_parent)
        # return the result
        return Details(
            時間割コード=timetable_code,
            共通科目コード=common_code,
            コース名=get_cell1("name"),
            教員=get_cell1("lecturer"),
            学期={
                Semester(el.text.replace(" ", "").replace("\n", ""))
                for el in cells1_parent.find_all(class_="catalog-semester-icon")
            },
            曜限=_parse_weekday_period(get_cell1("period")),
            教室=get_cell2(0),
            単位数=Decimal(get_cell2(1)),
            他学部履修可="不可" not in get_cell2(2),
            講義使用言語=get_cell2(3),
            実務経験のある教員による授業科目="YES" in get_cell2(4),
            開講所属=Faculty.value_of(get_cell2(5)),
            授業計画=get_card_text("授業計画"),
            授業の方法=get_card_text("授業の方法"),
            成績評価方法=get_card_text("成績評価方法"),
            教科書=get_card_text("教科書"),
            参考書=get_card_text("参考書"),
            履修上の注意=get_card_text("履修上の注意"),
            # NOTE(review): _format removes *all* spaces and line breaks,
            # whereas fetch_search keeps inner whitespace for the same field
            # (_format_description). Left unchanged; confirm which is wanted.
            ねらい=_format(
                _ensure_found(soup.find(class_="catalog-page-detail-lecture-aim")).text
            ),
        )

    async def fetch_common_code(self, 時間割コード: str) -> CommonCode:
        """Fetch common code of a course from its time table code.

        Returns
        -------
        CommonCode
            Common code of the course

        Raises
        ------
        IndexError
            Raises when the search returns no course.
        """
        result = await self.fetch_search(SearchParams(keyword=時間割コード))
        if not result.items:
            raise IndexError(f"No course found for '{時間割コード}'")
        return result.items[0].共通科目コード

    async def fetch_code(self, 共通科目コード: str) -> str:
        """Fetch time table code of a course from its common code.

        Returns
        -------
        str
            Time table code of the course

        Raises
        ------
        IndexError
            Raises when the search returns no course.
        """
        result = await self.fetch_search(SearchParams(keyword=共通科目コード))
        if not result.items:
            raise IndexError(f"No course found for '{共通科目コード}'")
        return result.items[0].時間割コード

    def retry(self, func: WrappedFn) -> WrappedFn:
        """Wrap ``func`` with this client's retry policy.

        Stops after 10 seconds or 3 attempts, whichever comes first, waits
        exponentially (4 to 16 seconds) between attempts and logs each retry
        at level 30 (WARNING) on this client's logger.
        """
        return retry(
            stop=(stop_after_delay(10) | stop_after_attempt(3)),
            wait=wait_exponential(multiplier=1, min=4, max=16),
            before_sleep=before_sleep_log(self._logger, 30),
        )(func)

    async def fetch_search_all(
        self,
        params: SearchParams,
        *,
        use_tqdm: bool = True,
        on_initial_request: Optional[
            Callable[[SearchResult], Optional[Awaitable]]
        ] = None,
    ) -> AsyncIterable[SearchResultItem]:
        """Fetch all search results by repeatedly calling `fetch_search`.

        The first page is fetched directly; the remaining pages are fetched
        concurrently with the retry policy. Pages that still fail are logged
        and skipped.

        Parameters
        ----------
        params : SearchParams
            Search parameters
        use_tqdm : bool, optional
            Whether to use tqdm, by default True
        on_initial_request : Optional[Callable[[SearchResult], Optional[Awaitable]]], optional
            Callback function to be called on the initial request, by default None

        Yields
        ------
        SearchResultItem
            Items of the first page, then items of the following pages in
            page order.
        """
        pbar = tqdm(disable=not use_tqdm)
        try:
            first_page = await self.fetch_search(params)
            pbar.update()
            if on_initial_request:
                await _await_if_future(on_initial_request(first_page))
            for item in first_page.items:
                yield item
            pbar.total = first_page.total_pages

            async def fetch_page(page: int) -> Optional[SearchResult]:
                try:
                    search = await self.retry(self.fetch_search)(params, page)
                # Not a bare ``except``: task cancellation must propagate.
                except Exception:
                    self._logger.error(f"Failed to fetch page {page}", exc_info=True)
                    return None
                pbar.update(1)
                return search

            tasks = [
                create_task(fetch_page(page))
                for page in range(2, first_page.total_pages + 1)
            ]
            for page_result in await asyncio.gather(*tasks):
                if page_result:
                    for item in page_result.items:
                        yield item
        finally:
            pbar.close()

    async def fetch_search_detail_all(
        self,
        params: SearchParams,
        *,
        year: int = 2022,
        use_tqdm: bool = True,
        on_initial_request: Optional[
            Callable[[SearchResult], Optional[Awaitable]]
        ] = None,
        on_detail_request: Optional[Callable[[Details], Optional[Awaitable]]] = None,
    ) -> list[Optional[Details]]:
        """Fetch all search results by repeatedly calling `fetch_search` and `fetch_detail`.

        Parameters
        ----------
        params : SearchParams
            Search parameters
        year : int, optional
            Year of the course, by default 2022
        use_tqdm : bool, optional
            Whether to use tqdm, by default True
        on_initial_request : Optional[Callable[[SearchResult], Optional[Awaitable]]], optional
            Callback function to be called on the initial request, by default None
        on_detail_request : Optional[Callable[[Details], Optional[Awaitable]]], optional
            Callback function to be called with each successfully fetched
            ``Details``, by default None

        Returns
        -------
        list[Optional[Details]]
            One entry per search result, in search order. An entry is
            ``None`` when fetching its details failed (the error is logged).
        """
        pbar = tqdm(disable=not use_tqdm)
        try:

            async def on_initial_request_wrapper(search_result: SearchResult):
                pbar.total = search_result.total_items_count
                if on_initial_request:
                    await _await_if_future(on_initial_request(search_result))

            items = [
                item
                async for item in self.fetch_search_all(
                    params,
                    # Honour the caller's choice instead of always showing a bar.
                    use_tqdm=use_tqdm,
                    on_initial_request=on_initial_request_wrapper,
                )
            ]
            # At most 100 detail requests are in flight at any time.
            s = asyncio.Semaphore(100)

            async def fetch_one(item: SearchResultItem) -> Optional[Details]:
                async with s:
                    try:
                        details = await self.retry(self.fetch_detail)(
                            item.時間割コード, year
                        )
                    except Exception as e:
                        self._logger.error(e)
                        return None
                    pbar.update()
                    if on_detail_request:
                        await _await_if_future(on_detail_request(details))
                    return details

            tasks = [create_task(fetch_one(item)) for item in items]
            return await asyncio.gather(*tasks)
        finally:
            pbar.close()

    def get_filepath(self, params: SearchParams, filename: Optional[str]) -> Path:
        """Return the pickle path for ``params`` and create its parent directory.

        Parameters
        ----------
        params : SearchParams
            Used only when ``filename`` is empty: ``params.id()`` becomes the
            file name.
        filename : Optional[str]
            Desired file name; ".pkl" is appended unless already present.
        """
        if not filename:
            filename = params.id()
        if not filename.endswith(".pkl"):
            filename += ".pkl"
        filepath = Path(filename)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        return filepath

    async def fetch_and_save_search_detail_all(
        self,
        params: SearchParams,
        *,
        year: int = 2022,
        filename: Optional[str] = None,
        use_tqdm: bool = True,
        on_initial_request: Optional[
            Callable[[SearchResult], Optional[Awaitable]]
        ] = None,
    ) -> list[Optional[Details]]:
        """Fetch all search results by repeatedly calling `fetch_search` and `fetch_detail` and save them to a PKL file.

        The filename is params.id() + ".pkl" if not specified. A failure to
        write the file is logged and does not prevent the result from being
        returned.

        Parameters
        ----------
        params : SearchParams
            Search parameters
        year : int, optional
            Year of the course, by default 2022
        filename : Optional[str], optional
            Filename to save the results, by default None. If None, the filename is params.id() + ".pkl".
        use_tqdm : bool, optional
            Whether to use tqdm, by default True
        on_initial_request : Optional[Callable[[SearchResult], Optional[Awaitable]]], optional
            Callback function to be called on the initial request, by default None

        Returns
        -------
        list[Optional[Details]]
            Same value as `fetch_search_detail_all`.
        """
        filepath = self.get_filepath(params, filename)
        self._logger.info(f"Saving to {filepath}")
        result = await self.fetch_search_detail_all(
            params,
            year=year,
            use_tqdm=use_tqdm,
            on_initial_request=on_initial_request,
        )
        try:
            async with aiofiles.open(filepath, "wb") as f:
                await f.write(pickle.dumps(result))
        except Exception as e:
            self._logger.error(e)
            self._logger.error(f"Skipping saving to {filepath}")
        return result

    async def fetch_and_save_search_detail_all_pandas(
        self,
        params: SearchParams,
        *,
        year: int = 2022,
        filename: Optional[str] = None,
        use_tqdm: bool = True,
        on_initial_request: Optional[
            Callable[[SearchResult], Optional[Awaitable]]
        ] = None,
    ) -> DataFrame:
        """Fetch and save all details, then convert them to a DataFrame and save that too.

        The raw results are saved by `fetch_and_save_search_detail_all`; the
        DataFrame is pickled next to them with the suffix ".pandas.pkl".

        Returns
        -------
        DataFrame
            The converted data. If the conversion fails, the error is logged
            and the raw list of details is returned instead.
        """
        data = await self.fetch_and_save_search_detail_all(
            params,
            year=year,
            use_tqdm=use_tqdm,
            on_initial_request=on_initial_request,
            filename=filename,
        )
        try:
            # Imported here so that an import failure is handled like any
            # other conversion failure.
            from .pandas import to_dataframe

            df = to_dataframe(data)
        except Exception as e:
            self._logger.error(e)
            self._logger.error("Returning raw data instead of pandas dataframe.")
            return data  # type: ignore
        filepath: Optional[Path] = None
        try:
            filepath = self.get_filepath(params, filename).with_suffix(".pandas.pkl")
            self._logger.info(f"Saving to {filepath}")
            df.to_pickle(filepath.absolute())
        except Exception as e:
            self._logger.error(e)
            # Report the actual target path when it is known.
            self._logger.error(f"Skipping saving to {filepath or filename}")
        return df