# CourseInformer/reader.py

"""
Module for reading course schedule information from HTML file
"""

import os
import re
from dataclasses import dataclass
from typing import List

from bs4 import BeautifulSoup


@dataclass
class TimeRange:
    """
    Pair of integers marking the start and end of a range.

    This module uses it for both the week range and the timeslot range of a
    course: text such as "1-8" is stored as ``TimeRange(1, 8)`` and a single
    number such as "3" is stored as ``TimeRange(3, 3)``.

    NOTE(review): both bounds look inclusive (weeks 1 through 8) — confirm
    against the consumers of this class.
    """
    from_value: int  # first number of the range
    to_value: int  # second number of the range; equals from_value for a single number


@dataclass
class CourseSchedule:
    """
    One course entry extracted from a cell of the schedule table.

    ``read_course_schedule`` starts every field at a placeholder
    ("Unknown Course", "Unknown", ``TimeRange(1, 1)``) and overwrites it with
    whatever it manages to parse from the cell text and the cell's id.
    """
    course_name: str  # course title, e.g. "企业资源计划（ERP）"
    instructor: str  # instructor name, e.g. "黄伟"
    week: TimeRange  # week range parsed from text such as "1-8" or "11-18"
    timeslot: TimeRange  # timeslot range parsed from text such as "[1-2]", "[6-7]", "[8-10]"
    location: str  # e.g., "[龙]二号楼2301"
    day: str  # e.g., "星期一", "星期二"; "Unknown" when it cannot be resolved
    period: str  # "上午", "下午", "晚上", or "Unknown" when it cannot be resolved


@dataclass
class StudentInfo:
    """
    Student details read from the schedule page.

    ``read_course_schedule`` fills the text fields with "Unknown" when the
    corresponding element is not found in the page.
    """
    student_id: str  # value of the <input id="xh"> element
    student_name: str  # text following the "姓名：" label in the first table
    class_name: str  # text following the "所在班级：" label in the first table
    total_credits: float  # read_course_schedule never parses this; it always passes 0.0


def read_course_schedule(file_path: str = "resources/教学安排表.xls") -> tuple[StudentInfo, List[CourseSchedule]]:
    """
    Read student and course schedule information from an exported HTML file.

    The file is opened as GBK-encoded text and parsed as HTML, regardless of
    its extension (the default path ends in ``.xls``).

    Args:
        file_path: Path to the HTML file containing the course schedule.

    Returns:
        Tuple of (StudentInfo, list of CourseSchedule objects). Values that
        cannot be extracted keep their placeholders ("Unknown",
        "Unknown Course", ``TimeRange(1, 1)``).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Course schedule file not found: {file_path}")

    # The export is GBK-encoded HTML.
    with open(file_path, 'r', encoding='GBK') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'html.parser')

    _student_info = _extract_student_info(soup)
    days = _extract_day_names(soup)

    _courses = []
    for div in soup.find_all('div', class_='div1'):
        xkinfo = div.find('span', class_='xkinfo')
        if xkinfo is None:
            continue

        # Day and period depend only on the cell's id, so resolve them once
        # per cell instead of once per course block.
        day, period = _resolve_day_and_period(div.get('id', ''), days)

        # A single cell may hold several courses, one per padded inner div.
        course_blocks = xkinfo.find_all('div', style=lambda x: x and 'padding-bottom:5px' in x)
        _courses.extend(
            _parse_course_block(block.get_text(), day, period)
            for block in course_blocks
        )

    return _student_info, _courses


# Day names used when the header row is missing or contains no day names.
_DEFAULT_DAYS = ["星期一", "星期二", "星期三", "星期四", "星期五"]

# Labels that prefix the student name / class cells in the first table.
_NAME_LABEL = "姓名："
_CLASS_LABEL = "所在班级："

# "<weeks>[<timeslot>]", e.g. "1-8[3-4]": group 1 is the week text, group 2
# the bracketed timeslot text.
_WEEK_TIMESLOT_RE = re.compile(r'([0-9\-]+)(\[[0-9\-]+\])')

# Trailing run of CJK ideographs, taken to be the instructor name.
_TRAILING_CJK_RE = re.compile(r'([\u4e00-\u9fff]+)$')

# Third character of a cell id -> part of the day.
_PERIOD_BY_INDEX = {1: "上午", 2: "上午", 3: "下午", 4: "下午", 5: "晚上"}


def _extract_student_info(soup) -> "StudentInfo":
    """Collect the student id, name and class from the parsed page."""
    student_id = "Unknown"
    student_name = "Unknown"
    class_name = "Unknown"

    # The student id lives in an <input id="xh"> element.
    xh_input = soup.find('input', {'id': 'xh'})
    if xh_input is not None and xh_input.get('value'):
        student_id = xh_input['value']

    # Name and class are labelled cells in the first table of the page.
    first_table = soup.find('table')
    if first_table is not None:
        for row in first_table.find_all('tr'):
            for cell in row.find_all('td'):
                text = cell.get_text().strip()
                # Strip only the leading label (not later occurrences of the
                # same text) and any whitespace that follows it.
                if text.startswith(_NAME_LABEL):
                    student_name = text[len(_NAME_LABEL):].strip()
                elif text.startswith(_CLASS_LABEL):
                    class_name = text[len(_CLASS_LABEL):].strip()

    return StudentInfo(
        student_id=student_id,
        student_name=student_name,
        class_name=class_name,
        # Credits are not parsed from the page; the field stays at 0.0.
        total_credits=0.0,
    )


def _extract_day_names(soup) -> List[str]:
    """Return the day names from the header row, or the defaults if none are found."""
    header_row = soup.find('tr', class_='H')
    if header_row is None:
        return list(_DEFAULT_DAYS)

    texts = (cell.get_text().strip() for cell in header_row.find_all('td', class_='td0'))
    found = [text for text in texts if "星期" in text]  # keep actual day names only
    # A header row without recognisable day names must not wipe out the
    # defaults, otherwise every course would end up with day "Unknown".
    return found or list(_DEFAULT_DAYS)


def _resolve_day_and_period(div_id: str, days: List[str]):
    """Map a cell id such as "k11" to a (day, period) pair, "Unknown" where unresolved."""
    day = "Unknown"
    period = "Unknown"
    if not (div_id.startswith('k') and len(div_id) >= 3):
        return day, period

    # Character after 'k' is the 1-based day column (1 = first entry of days).
    try:
        day_index = int(div_id[1]) - 1
    except ValueError:
        day_index = -1
    if 0 <= day_index < len(days):
        day = days[day_index]

    # Next character selects the part of the day: 1,2 = 上午, 3,4 = 下午, 5 = 晚上.
    try:
        period = _PERIOD_BY_INDEX.get(int(div_id[2]), "Unknown")
    except ValueError:
        pass

    return day, period


def _parse_course_block(raw_text: str, day: str, period: str) -> "CourseSchedule":
    """
    Build a CourseSchedule from the concatenated text of one course block.

    Expected layout: course_name + instructor + weeks[timeslot] + location,
    e.g. "企业资源计划（ERP）黄伟 1-8[3-4][龙]一号楼1307".

    NOTE(review): course name and instructor are separated heuristically (the
    trailing run of CJK characters is taken as the instructor); confirm
    against the real markup whether a structural separator is available.
    """
    course_name = "Unknown Course"
    instructor = "Unknown"
    location = "Unknown"
    week_range = TimeRange(1, 1)
    timeslot_range = TimeRange(1, 1)

    time_match = _WEEK_TIMESLOT_RE.search(raw_text)
    if time_match is None:
        # No week/timeslot info: salvage at least the first token as the name.
        parts = raw_text.split()
        if parts:
            course_name = parts[0]
    else:
        week_range = time_parser(time_match.group(1), week_range)
        timeslot_range = time_parser(time_match.group(2).strip('[]'), timeslot_range)

        # Everything after the time info is the location; keep the
        # placeholder when nothing but whitespace follows.
        location = raw_text[time_match.end():].strip() or location

        before_time = raw_text[:time_match.start()].strip()
        instructor_match = _TRAILING_CJK_RE.search(before_time)
        name_part = before_time[:instructor_match.start()].strip() if instructor_match else ""
        if name_part:
            instructor = instructor_match.group(1)
            course_name = name_part
        elif before_time:
            # Either no trailing CJK run, or the run covers the whole text and
            # cannot be split: treat it all as the course name rather than
            # returning an empty name with a bogus instructor.
            course_name = before_time

    return CourseSchedule(
        course_name=course_name,
        instructor=instructor,
        week=week_range,
        timeslot=timeslot_range,
        location=location,
        day=day,
        period=period,
    )


def time_parser(timeslot_clean, timeslot_range):
    """
    Turn range text into a TimeRange, falling back to a supplied default.

    Args:
        timeslot_clean: Text to parse, either "<from>-<to>" (e.g. "1-8") or a
            single integer (e.g. "3"). Used for both week and timeslot text.
        timeslot_range: Value returned unchanged when the text cannot be parsed.

    Returns:
        A new TimeRange on success, otherwise ``timeslot_range``.
    """
    if '-' not in timeslot_clean:
        # A lone number is a range that starts and ends on the same value.
        bounds = [timeslot_clean, timeslot_clean]
    else:
        bounds = timeslot_clean.split('-')
        if len(bounds) != 2:
            # More than one dash: not a simple "<from>-<to>" pair.
            return timeslot_range

    try:
        low, high = int(bounds[0]), int(bounds[1])
    except ValueError:
        return timeslot_range
    return TimeRange(low, high)


if __name__ == "__main__":
    # Manual smoke test: parse the default schedule file and print the
    # student details followed by every course entry that was found.
    try:
        info, schedule = read_course_schedule()
        print(f"Student: {info.student_name} ({info.student_id})")
        print(f"Class: {info.class_name}")
        print("\nCourses:")
        for entry in schedule:  # all entries are printed, not a subset
            weeks = entry.week
            slots = entry.timeslot
            print(f"- {entry.course_name} by {entry.instructor}")
            print(f"  Time: {weeks.from_value}-{weeks.to_value} [{slots.from_value}-{slots.to_value}]")
            print(f"  Location: {entry.location}")
            print(f"  Day: {entry.day}, Period: {entry.period}")
            print()
    except Exception as err:
        # Top-level boundary of the script: report the failure instead of a traceback.
        print(f"Error reading course schedule: {err}")