Files
CourseInformer/reader.py
2025-08-20 19:30:37 +08:00

248 lines
9.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Module for reading course schedule information from HTML file
"""
import os
import re
from dataclasses import dataclass
from typing import List
from bs4 import BeautifulSoup
@dataclass
class TimeRange:
    """A pair of integer bounds: where a range starts and where it ends."""

    from_value: int  # start of the range
    to_value: int  # end of the range
@dataclass
class CourseSchedule:
    """One entry of the timetable: a course meeting with its time and place."""

    course_name: str
    instructor: str
    # Week span of the course, e.g. "1-8" or "11-18" in the source text.
    week: TimeRange
    # Class-period span within the day, e.g. "[1-2]", "[6-7]", "[8-10]".
    timeslot: TimeRange
    # Room description, e.g. "[龙]二号楼2301".
    location: str
    # Weekday name, e.g. "星期一", "星期二".
    day: str
    # Part of the day, e.g. "上午", "下午", "晚上".
    period: str
@dataclass
class StudentInfo:
    """Identity and summary details of the student who owns the schedule."""

    student_id: str  # student number
    student_name: str  # student's name
    class_name: str  # name of the class the student belongs to
    total_credits: float  # total credit count
def read_course_schedule(file_path: str = "resources/教学安排表.xls") -> tuple[StudentInfo, List[CourseSchedule]]:
    """
    Read course schedule information from an HTML file.

    The file is opened as GBK-encoded text and parsed as HTML (regardless of
    the file extension).

    Args:
        file_path: Path to the HTML file containing the course schedule.

    Returns:
        Tuple of (StudentInfo, list of CourseSchedule objects). Fields that
        cannot be found in the document keep placeholder values such as
        "Unknown", "Unknown Course" or ``TimeRange(1, 1)``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Course schedule file not found: {file_path}")

    # Read the HTML file with GBK encoding
    with open(file_path, 'r', encoding='GBK') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')
    student = _parse_student_info(soup)
    days = _parse_day_names(soup)

    schedule = []
    for div in soup.find_all('div', class_='div1'):
        xkinfo = div.find('span', class_='xkinfo')
        if not xkinfo:
            continue
        # Day and period depend only on the enclosing div, so resolve them
        # once instead of once per course block.
        day, period = _day_and_period_from_id(div.get('id', ''), days)
        # Each div may contain multiple courses, one per padded inner div.
        course_blocks = xkinfo.find_all(
            'div', style=lambda x: x and 'padding-bottom:5px' in x
        )
        for block in course_blocks:
            schedule.append(_parse_course_block(block.get_text(), day, period))
    return student, schedule


# Weekday names used when the document does not provide usable header cells.
_DEFAULT_DAYS = ("星期一", "星期二", "星期三", "星期四", "星期五")

# Maps the period digit of a cell id to the part of the day.
_PERIOD_BY_INDEX = {1: "上午", 2: "上午", 3: "下午", 4: "下午", 5: "晚上"}

# "<weeks>[<timeslot>]", e.g. "1-8[3-4]".
_TIME_RE = re.compile(r'([0-9\-]+)(\[[0-9\-]+\])')

# Trailing run of CJK characters, taken to be the instructor name.
_TRAILING_CJK_RE = re.compile(r'([\u4e00-\u9fff]+)$')


def _parse_student_info(soup) -> StudentInfo:
    """Collect the student id, name and class name from the parsed document."""
    student_id = "Unknown"
    student_name = "Unknown"
    class_name = "Unknown"

    # The student id comes from the input element with id "xh".
    xh_input = soup.find('input', {'id': 'xh'})
    if xh_input and xh_input.get('value'):
        student_id = xh_input['value']

    # Name and class are labelled cells in the first table of the document.
    tables = soup.find_all('table')
    if tables:
        for row in tables[0].find_all('tr'):
            for cell in row.find_all('td'):
                text = cell.get_text().strip()
                if text.startswith("姓名:"):
                    student_name = text.replace("姓名:", "")
                elif text.startswith("所在班级:"):
                    class_name = text.replace("所在班级:", "")

    return StudentInfo(
        student_id=student_id,
        student_name=student_name,
        class_name=class_name,
        # NOTE: the credit total is not extracted from the document; it is
        # always reported as 0.0.
        total_credits=0.0,
    )


def _parse_day_names(soup) -> List[str]:
    """Return the weekday names from the header row, or the defaults."""
    header_row = soup.find('tr', class_='H')
    if header_row:
        texts = (
            cell.get_text().strip()
            for cell in header_row.find_all('td', class_='td0')
        )
        # Only keep cells that actually name a weekday.
        names = [text for text in texts if "星期" in text]
        if names:
            return names
    # No header row, or a header row without any weekday cell: use the
    # defaults so that courses can still be mapped to a day.
    return list(_DEFAULT_DAYS)


def _day_and_period_from_id(div_id: str, days: List[str]) -> tuple[str, str]:
    """Derive (day, period) from a cell id such as "k11" or "k21".

    The character after 'k' is the 1-based day index into ``days`` and the
    next character is the period index (1, 2 = 上午; 3, 4 = 下午; 5 = 晚上).
    Anything that cannot be resolved is reported as "Unknown".
    """
    day = "Unknown"
    period = "Unknown"
    if not (div_id.startswith('k') and len(div_id) >= 3):
        return day, period

    try:
        day_index = int(div_id[1]) - 1
    except ValueError:
        pass
    else:
        if 0 <= day_index < len(days):
            day = days[day_index]

    try:
        period = _PERIOD_BY_INDEX.get(int(div_id[2]), "Unknown")
    except ValueError:
        pass

    return day, period


def _parse_course_block(raw_text: str, day: str, period: str) -> CourseSchedule:
    """Build a CourseSchedule from the concatenated text of one course block.

    Expected layout: course_name + instructor + weeks[timeslot] + location,
    e.g. "企业资源计划ERP黄伟 1-8[3-4][龙]一号楼1307".
    """
    course_name = "Unknown Course"
    instructor = "Unknown"
    location = "Unknown"
    week_range = TimeRange(1, 1)
    timeslot_range = TimeRange(1, 1)

    time_match = _TIME_RE.search(raw_text)
    if time_match is None:
        # Fallback: without time information, take the first
        # whitespace-separated token as the course name.
        parts = raw_text.split()
        if parts:
            course_name = parts[0]
    else:
        week_range = time_parser(time_match.group(1), week_range)
        # The timeslot group still carries its surrounding brackets.
        timeslot_range = time_parser(
            time_match.group(2).strip('[]'), timeslot_range
        )

        # Location is everything after the time information.
        time_end = time_match.end()
        if time_end < len(raw_text):
            location = raw_text[time_end:].strip()

        # The text before the time information holds course name + instructor.
        before_time = raw_text[:time_match.start()].strip()
        instructor_match = _TRAILING_CJK_RE.search(before_time)
        if instructor_match:
            # NOTE(review): the whole trailing CJK run is taken as the
            # instructor, so a course name ending in CJK characters directly
            # next to the instructor name ends up in `instructor` as well.
            instructor = instructor_match.group(1)
            course_name = before_time[:instructor_match.start()].strip()
        else:
            # No trailing CJK run: treat the whole prefix as the course name.
            course_name = before_time

    return CourseSchedule(
        course_name=course_name,
        instructor=instructor,
        week=week_range,
        timeslot=timeslot_range,
        location=location,
        day=day,
        period=period,
    )
def time_parser(timeslot_clean, timeslot_range):
    """Convert text such as "3-4" or "5" into a TimeRange.

    Args:
        timeslot_clean: Either two integers joined by a single '-' or one
            lone integer (which becomes both bounds of the range).
        timeslot_range: Value handed back unchanged when the text cannot be
            parsed.

    Returns:
        A new TimeRange on success, otherwise ``timeslot_range``.
    """
    if '-' in timeslot_clean:
        bounds = timeslot_clean.split('-')
        # Only the plain "a-b" form is accepted.
        if len(bounds) != 2:
            return timeslot_range
    else:
        # A single number is a range that starts and ends on itself.
        bounds = [timeslot_clean, timeslot_clean]

    try:
        start, end = (int(bound) for bound in bounds)
    except ValueError:
        return timeslot_range
    return TimeRange(start, end)
if __name__ == "__main__":
    # Manual smoke test: parse the default schedule file and dump the result.
    try:
        student_info, courses = read_course_schedule()

        print(f"Student: {student_info.student_name} ({student_info.student_id})")
        print(f"Class: {student_info.class_name}")
        print("\nCourses:")

        # Every parsed course is printed, each followed by a blank line.
        for course in courses:
            weeks = course.week
            slots = course.timeslot
            print(f"- {course.course_name} by {course.instructor}")
            print(f" Time: {weeks.from_value}-{weeks.to_value} [{slots.from_value}-{slots.to_value}]")
            print(f" Location: {course.location}")
            print(f" Day: {course.day}, Period: {course.period}")
            print()
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit.
        print(f"Error reading course schedule: {e}")