From 54a63174b840f5ea9450e9a5ac62011516e39a50 Mon Sep 17 00:00:00 2001
From: Muhan Li
Date: Mon, 26 Sep 2022 01:23:23 +0800
Subject: [PATCH] Update Python Script

---
 .editorconfig | 43 ++++++++++++++++++----------
 README.md     |  2 +-
 crawler.py    | 79 ++++++++++++++++++++++++++-------------------
 3 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/.editorconfig b/.editorconfig
index d49062e..89284ea 100755
--- a/.editorconfig
+++ b/.editorconfig
@@ -1,15 +1,28 @@
-root = true
-
-[*]
-end_of_line = lf
-insert_final_newline = true
-
-# Matches multiple files with brace expansion notation
-[*.{py,java,cpp,go,js,html}]
-charset = utf-8
-indent_style = tab
-indent_size = 4
-trim_trailing_whitespace = true
-
-[*.md]
-trim_trailing_whitespace = false
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+
+# Matches multiple files with brace expansion notation
+[{*.go,go.mod,go.sum}]
+charset = utf-8
+indent_style = tab
+indent_size = 4
+trim_trailing_whitespace = true
+
+[*.py]
+charset = utf-8
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+
+[*.{js,html,css,json,yml}]
+charset = utf-8
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+
+[*.{md,txt}]
+trim_trailing_whitespace = false
diff --git a/README.md b/README.md
index cba70f9..d68a01c 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 subscription link of public holidays in mainland China
 
-> Calendar data updated at 1:01 on September 25, 2022
+> Calendar data updated at 1:37 on September 26, 2022
 
 ## Demo
 
diff --git a/crawler.py b/crawler.py
index 9bde601..898c670 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,3 +1,5 @@
+"""Crawl holiday information from the State Council official website"""
+
 import os
 import re
 from datetime import datetime, timezone, timedelta
@@ -7,6 +9,7 @@ import requests
 
 
 def main():
+    """Update holiday information"""
     comments: list[str] = [
         "// automatically generated by crawler.py",
         "// manually checked by DATA NOT VERIFIED",
@@ -17,92 +20,90 @@ def main():
         file = f'./data/{year}.txt'
 
         if os.path.isfile(file):
-            with open(file) as f:
-                existing = f.read()
+            with open(file, encoding='utf-8') as f_obj:
+                existing = f_obj.read()
 
             if comments[0] in existing and comments[1] not in existing:
-                continue
+                continue  # data already verified manually
 
-        with open(file, 'w') as f:
-            f.write(
+        with open(file, 'w', encoding='utf-8') as f_obj:
+            f_obj.write(
                 f"{comments[0]} ({beijing_time().strftime('%-m/%-d/%Y')})\n"
                 f"{comments[1]}\n// source: {link}\n\n{holidays}"
             )
 
+    with open('./README.md', 'r', encoding='utf-8') as f_obj:
+        content = f_obj.read().split('\n')
+
     update_info = "> Calendar data updated "
-    with open('./README.md', 'r') as f:
-        content = f.read().split('\n')
-    for i in range(len(content)):
-        if content[i].startswith(update_info):
+    for i, line in enumerate(content):
+        if line.startswith(update_info):
             content[i] = update_info + beijing_time().strftime("at %-H:%M on %B %-d, %Y")
-    with open('./README.md', 'w') as f:
-        f.write('\n'.join(content))
+
+    with open('./README.md', 'w', encoding='utf-8') as f_obj:
+        f_obj.write('\n'.join(content))
 
 
 def data() -> Iterator[Tuple[str, str, str]]:
+    """Crawl data from the State Council website"""
     for year, link in source():
         print(f"\n\n{year}: {link}")
         results: list[str] = []
 
-        r = requests.get(link)
-        r.encoding = r.apparent_encoding
+        response = requests.get(link, timeout=(5, 10))
+        response.encoding = response.apparent_encoding
 
         line_regex = r"(?P<index>.)、(?P<name>.*):(<.*?>)?(?P<detail>.*放假.*。)"
 
-        for line in r.text.replace('<br/>', '\n').split('\n'):
-            match = re.search(line_regex, line)
-            if match is None:
-                continue
-
-            work, rest, *_ = match.group('detail').split('。')
-            dates = ';'.join((match.group('name'), parse(work), parse(rest)))
-            print(dates)  # todo: manual intervention needed in these cases: 1. holidays merged with a weekend, 2. compensatory rest days
-            results.append(f"{dates:30} // {match.group('detail')}")
+        for line in response.text.replace('<br/>', '\n').split('\n'):
+            if match := re.search(line_regex, line):
+                work, rest, *_ = match.group('detail').split('。')
+                dates = ';'.join((match.group('name'), parse(work), parse(rest)))
+                print(dates)  # known cases needing manual intervention: 1. holidays merged with a weekend, 2. compensatory rest days
+                results.append(f"{dates:30} // {match.group('detail')}")
 
         yield year, link, '\n'.join(results)
 
 
 def parse(text: str) -> str:
+    """Parse holiday schedule data"""
     results: list[str] = []
 
     range_type_a = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
     range_type_b = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
     single_date = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"
 
     for item in text.split('、'):
-        match = re.search(range_type_a, item)
-        if match is not None:
-            results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m2')}.{match.group('d2')}")
+        if match := re.search(range_type_a, item):
+            results.append(f"{match.group('m1')}.{match.group('d1')}-"
+                           f"{match.group('m2')}.{match.group('d2')}")
             print(f"\tA: {results[-1]:15} {item}")
-            continue
-        match = re.search(range_type_b, item)
-        if match is not None:
-            results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m1')}.{match.group('d2')}")
+        elif match := re.search(range_type_b, item):
+            results.append(f"{match.group('m1')}.{match.group('d1')}-"
+                           f"{match.group('m1')}.{match.group('d2')}")
             print(f"\tB: {results[-1]:15} {item}")
-            continue
-        match = re.search(single_date, item)
-        if match is not None:
+        elif match := re.search(single_date, item):
             results.append(f"{match.group('m1')}.{match.group('d1')}")
             print(f"\tS: {results[-1]:15} {item}")
-            continue
-        print(f"\tX: {'':15} {item}")
+        else:
+            print(f"\tX: {'':15} {item}")
 
     return ','.join(results)
 
 
 def source() -> Iterator[Tuple[str, str]]:
+    """Fetch the list of notices published on the official website"""
     search_url = "http://sousuo.gov.cn/s.htm?t=paper&advance=false&n=&codeYear=&codeCode=" \
                  "&searchfield=title&sort=&q=%E8%8A%82%E5%81%87%E6%97%A5%E5%AE%89%E6%8E%92"
     link_regex = r"href=['\"](?P<link>.*?)['\"].*国务院办公厅关于(?P<year>20\d\d)年.*通知"
 
-    for line in requests.get(search_url).text.split('\n'):
-        match = re.search(link_regex, line)
-        if match is None:
-            continue
-        yield match.group('year'), match.group('link')
+    for line in requests.get(search_url, timeout=(5, 10)).text.split('\n'):
+        if match := re.search(link_regex, line):
+            yield match.group('year'), match.group('link')
 
 
 def beijing_time() -> datetime:
+    """Get the current Beijing time"""
     utc_time = datetime.utcnow().replace(tzinfo=timezone.utc)
     return utc_time.astimezone(timezone(timedelta(hours=8)))
 
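
For reference, the date extraction performed by parse() can be exercised on its own. The sketch below is not part of the patch: it reuses the three regular expressions from crawler.py, but the simplified parse_dates() helper and the two sample sentences are assumptions written in the style of a State Council holiday notice.

import re

# Patterns as defined in crawler.py: cross-month range, same-month range, single date.
RANGE_A = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
RANGE_B = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
SINGLE = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"


def parse_dates(text: str) -> str:
    """Simplified stand-in for parse(): turn one notice sentence into 'M.D' / 'M.D-M.D' tokens."""
    results = []
    for item in text.split('、'):
        if match := re.search(RANGE_A, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}-"
                           f"{match.group('m2')}.{match.group('d2')}")
        elif match := re.search(RANGE_B, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}-"
                           f"{match.group('m1')}.{match.group('d2')}")
        elif match := re.search(SINGLE, item):
            results.append(f"{match.group('m1')}.{match.group('d1')}")
    return ','.join(results)


# Hypothetical sentences modeled on the wording of a holiday notice.
print(parse_dates("1月31日至2月6日放假调休,共7天"))      # -> 1.31-2.6
print(parse_dates("1月29日(星期六)、1月30日(星期日)上班"))  # -> 1.29,1.30

Splitting on '、' before matching is what lets the second, make-up workday sentence yield one token per listed date rather than a single match.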