From 54a63174b840f5ea9450e9a5ac62011516e39a50 Mon Sep 17 00:00:00 2001
From: Muhan Li
Date: Mon, 26 Sep 2022 01:23:23 +0800
Subject: [PATCH] Update Python Script

---
 .editorconfig | 43 ++++++++++++++++++----------
 README.md     |  2 +-
 crawler.py    | 79 ++++++++++++++++++++++++++-------------------
 3 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/.editorconfig b/.editorconfig
index d49062e..89284ea 100755
--- a/.editorconfig
+++ b/.editorconfig
@@ -1,15 +1,28 @@
-root = true
-
-[*]
-end_of_line = lf
-insert_final_newline = true
-
-# Matches multiple files with brace expansion notation
-[*.{py,java,cpp,go,js,html}]
-charset = utf-8
-indent_style = tab
-indent_size = 4
-trim_trailing_whitespace = true
-
-[*.md]
-trim_trailing_whitespace = false
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+
+# Matches multiple files with brace expansion notation
+[{*.go,go.mod,go.sum}]
+charset = utf-8
+indent_style = tab
+indent_size = 4
+trim_trailing_whitespace = true
+
+[*.py]
+charset = utf-8
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+
+[*.{js,html,css,json,yml}]
+charset = utf-8
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+
+[*.{md,txt}]
+trim_trailing_whitespace = false
diff --git a/README.md b/README.md
index cba70f9..d68a01c 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 subscription link of public holidays in mainland China
 
-> Calendar data updated at 1:01 on September 25, 2022
+> Calendar data updated at 1:37 on September 26, 2022
 
 ## Demo
 
diff --git a/crawler.py b/crawler.py
index 9bde601..898c670 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,3 +1,5 @@
+"""Crawl holiday information from the State Council official website"""
+
 import os
 import re
 from datetime import datetime, timezone, timedelta
@@ -7,6 +9,7 @@ import requests
 
 
 def main():
+    """Update holiday information"""
     comments: list[str] = [
         "// automatically generated by crawler.py",
         "// manually checked by DATA NOT VERIFIED",
@@ -17,92 +20,90 @@ def main():
         file = f'./data/{year}.txt'
 
         if os.path.isfile(file):
-            with open(file) as f:
-                existing = f.read()
+            with open(file, encoding='utf-8') as f_obj:
+                existing = f_obj.read()
 
             if comments[0] in existing and comments[1] not in existing:
-                continue
+                continue  # data already verified manually
 
-        with open(file, 'w') as f:
-            f.write(
+        with open(file, 'w', encoding='utf-8') as f_obj:
+            f_obj.write(
                 f"{comments[0]} ({beijing_time().strftime('%-m/%-d/%Y')})\n"
                 f"{comments[1]}\n// source: {link}\n\n{holidays}"
             )
 
+    with open('./README.md', 'r', encoding='utf-8') as f_obj:
+        content = f_obj.read().split('\n')
+
     update_info = "> Calendar data updated "
-    with open('./README.md', 'r') as f:
-        content = f.read().split('\n')
-    for i in range(len(content)):
-        if content[i].startswith(update_info):
+    for i, line in enumerate(content):
+        if line.startswith(update_info):
             content[i] = update_info + beijing_time().strftime("at %-H:%M on %B %-d, %Y")
-    with open('./README.md', 'w') as f:
-        f.write('\n'.join(content))
+
+    with open('./README.md', 'w', encoding='utf-8') as f_obj:
+        f_obj.write('\n'.join(content))
 
 
 def data() -> Iterator[Tuple[str, str, str]]:
+    """Crawl data from the State Council website"""
     for year, link in source():
         print(f"\n\n{year}: {link}")
         results: list[str] = []
 
-        r = requests.get(link)
-        r.encoding = r.apparent_encoding
+        response = requests.get(link, timeout=(5, 10))
+        response.encoding = response.apparent_encoding
 
         line_regex = r"(?P<index>.)、(?P<name>.*):(<.*?>)?(?P<detail>.*放假.*。)"
 
-        for line in r.text.replace('<br/>', '\n').split('\n'):
-            match = re.search(line_regex, line)
-            if match is None:
-                continue
-
-            work, rest, *_ = match.group('detail').split('。')
-            dates = ';'.join((match.group('name'), parse(work), parse(rest)))
-            print(dates)  # todo: manual intervention needed in these cases: 1. holidays merged with a weekend, 2. compensatory rest days
-            results.append(f"{dates:30} // {match.group('detail')}")
+        for line in response.text.replace('<br/>', '\n').split('\n'):
+            if match := re.search(line_regex, line):
+                work, rest, *_ = match.group('detail').split('。')
+                dates = ';'.join((match.group('name'), parse(work), parse(rest)))
+                print(dates)  # known cases needing manual intervention: 1. holidays merged with a weekend, 2. compensatory rest days
+                results.append(f"{dates:30} // {match.group('detail')}")
 
         yield year, link, '\n'.join(results)
 
 
 def parse(text: str) -> str:
+    """Parse holiday schedule data"""
     results: list[str] = []
 
     range_type_a = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
     range_type_b = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
     single_date = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"
 
     for item in text.split('、'):
-        match = re.search(range_type_a, item)
-        if match is not None:
-            results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m2')}.{match.group('d2')}")
+        if match := re.search(range_type_a, item):
+            results.append(f"{match.group('m1')}.{match.group('d1')}-"
+                           f"{match.group('m2')}.{match.group('d2')}")
             print(f"\tA: {results[-1]:15} {item}")
-            continue
-        match = re.search(range_type_b, item)
-        if match is not None:
-            results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m1')}.{match.group('d2')}")
+        elif match := re.search(range_type_b, item):
+            results.append(f"{match.group('m1')}.{match.group('d1')}-"
+                           f"{match.group('m1')}.{match.group('d2')}")
             print(f"\tB: {results[-1]:15} {item}")
-            continue
-        match = re.search(single_date, item)
-        if match is not None:
+        elif match := re.search(single_date, item):
             results.append(f"{match.group('m1')}.{match.group('d1')}")
             print(f"\tS: {results[-1]:15} {item}")
-            continue
-        print(f"\tX: {'':15} {item}")
+        else:
+            print(f"\tX: {'':15} {item}")
 
     return ','.join(results)
 
 
 def source() -> Iterator[Tuple[str, str]]:
+    """Fetch the list of notices published on the official website"""
     search_url = "http://sousuo.gov.cn/s.htm?t=paper&advance=false&n=&codeYear=&codeCode=" \
                  "&searchfield=title&sort=&q=%E8%8A%82%E5%81%87%E6%97%A5%E5%AE%89%E6%8E%92"
     link_regex = r"href=['\"](?P<link>.*?)['\"].*国务院办公厅关于(?P<year>20\d\d)年.*通知"
 
-    for line in requests.get(search_url).text.split('\n'):
-        match = re.search(link_regex, line)
-        if match is None:
-            continue
-        yield match.group('year'), match.group('link')
+    for line in requests.get(search_url, timeout=(5, 10)).text.split('\n'):
+        if match := re.search(link_regex, line):
+            yield match.group('year'), match.group('link')
 
 
 def beijing_time() -> datetime:
+    """Get the current Beijing time"""
     utc_time = datetime.utcnow().replace(tzinfo=timezone.utc)
     return utc_time.astimezone(timezone(timedelta(hours=8)))
 
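
For reference, the date extraction performed by parse() can be exercised on its own. The sketch below is not part of the patch: it reuses the three regular expressions from crawler.py, but the simplified parse_dates() helper and the two sample sentences are assumptions written in the style of a State Council holiday notice.

import re

# Patterns as defined in crawler.py: cross-month range, same-month range, single date.
RANGE_A = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
RANGE_B = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
SINGLE = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"


def parse_dates(text: str) -> str:
    """Simplified stand-in for parse(): turn one notice sentence into 'M.D' / 'M.D-M.D' tokens."""
    results = []
    for item in text.split('、'):
        if match := re.search(RANGE_A, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}-"
                           f"{match.group('m2')}.{match.group('d2')}")
        elif match := re.search(RANGE_B, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}-"
                           f"{match.group('m1')}.{match.group('d2')}")
        elif match := re.search(SINGLE, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}")
    return ','.join(results)


# Hypothetical sentences modeled on the wording of a holiday notice.
print(parse_dates("1月31日至2月6日放假调休,共7天"))      # -> 1.31-2.6
print(parse_dates("1月29日(星期六)、1月30日(星期日)上班"))  # -> 1.29,1.30

Splitting on '、' before matching is what lets the second, make-up workday sentence yield one token per listed date rather than a single match.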