mirror of
https://github.com/muhac/chinese-holidays-calendar.git
synced 2024-11-27 01:50:00 +08:00
Update Python Script
This commit is contained in:
parent
05f3fbc89c
commit
54a63174b8
@ -1,15 +1,28 @@
|
|||||||
root = true
|
# top-most EditorConfig file
|
||||||
|
root = true
|
||||||
[*]
|
|
||||||
end_of_line = lf
|
[*]
|
||||||
insert_final_newline = true
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
# Matches multiple files with brace expansion notation
|
|
||||||
[*.{py,java,cpp,go,js,html}]
|
# Matches multiple files with brace expansion notation
|
||||||
charset = utf-8
|
[{*.go,go.mod,go.sum}]
|
||||||
indent_style = tab
|
charset = utf-8
|
||||||
indent_size = 4
|
indent_style = tab
|
||||||
trim_trailing_whitespace = true
|
indent_size = 4
|
||||||
|
trim_trailing_whitespace = true
|
||||||
[*.md]
|
|
||||||
trim_trailing_whitespace = false
|
[*.py]
|
||||||
|
charset = utf-8
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
|
||||||
|
[*.{js,html,css,json,yml}]
|
||||||
|
charset = utf-8
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
|
||||||
|
[*.{md,txt}]
|
||||||
|
trim_trailing_whitespace = false
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
subscription link of public holidays in mainland China
|
subscription link of public holidays in mainland China
|
||||||
|
|
||||||
> Calendar data updated at 1:01 on September 25, 2022
|
> Calendar data updated at 1:37 on September 26, 2022
|
||||||
|
|
||||||
## Demo
|
## Demo
|
||||||
|
|
||||||
|
79
crawler.py
79
crawler.py
@ -1,3 +1,5 @@
|
|||||||
|
"""从国务院官网抓取放假信息"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone, timedelta
|
||||||
@ -7,6 +9,7 @@ import requests
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
"""更新节假日信息"""
|
||||||
comments: list[str] = [
|
comments: list[str] = [
|
||||||
"// automatically generated by crawler.py",
|
"// automatically generated by crawler.py",
|
||||||
"// manually checked by DATA NOT VERIFIED",
|
"// manually checked by DATA NOT VERIFIED",
|
||||||
@ -17,92 +20,90 @@ def main():
|
|||||||
file = f'./data/{year}.txt'
|
file = f'./data/{year}.txt'
|
||||||
|
|
||||||
if os.path.isfile(file):
|
if os.path.isfile(file):
|
||||||
with open(file) as f:
|
with open(file, encoding='utf-8') as f_obj:
|
||||||
existing = f.read()
|
existing = f_obj.read()
|
||||||
if comments[0] in existing and comments[1] not in existing:
|
if comments[0] in existing and comments[1] not in existing:
|
||||||
continue
|
continue # 数据已人工确认
|
||||||
|
|
||||||
with open(file, 'w') as f:
|
with open(file, 'w', encoding='utf-8') as f_obj:
|
||||||
f.write(
|
f_obj.write(
|
||||||
f"{comments[0]} ({beijing_time().strftime('%-m/%-d/%Y')})\n"
|
f"{comments[0]} ({beijing_time().strftime('%-m/%-d/%Y')})\n"
|
||||||
f"{comments[1]}\n// source: {link}\n\n{holidays}"
|
f"{comments[1]}\n// source: {link}\n\n{holidays}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with open('./README.md', 'r', encoding='utf-8') as f_obj:
|
||||||
|
content = f_obj.read().split('\n')
|
||||||
|
|
||||||
update_info = "> Calendar data updated "
|
update_info = "> Calendar data updated "
|
||||||
with open('./README.md', 'r') as f:
|
for i, line in enumerate(content):
|
||||||
content = f.read().split('\n')
|
if line.startswith(update_info):
|
||||||
for i in range(len(content)):
|
|
||||||
if content[i].startswith(update_info):
|
|
||||||
content[i] = update_info + beijing_time().strftime("at %-H:%M on %B %-d, %Y")
|
content[i] = update_info + beijing_time().strftime("at %-H:%M on %B %-d, %Y")
|
||||||
with open('./README.md', 'w') as f:
|
|
||||||
f.write('\n'.join(content))
|
with open('./README.md', 'w', encoding='utf-8') as f_obj:
|
||||||
|
f_obj.write('\n'.join(content))
|
||||||
|
|
||||||
|
|
||||||
def data() -> Iterator[Tuple[str, str, str]]:
|
def data() -> Iterator[Tuple[str, str, str]]:
|
||||||
|
"""爬取国务院网站数据"""
|
||||||
for year, link in source():
|
for year, link in source():
|
||||||
print(f"\n\n{year}: {link}")
|
print(f"\n\n{year}: {link}")
|
||||||
results: list[str] = []
|
results: list[str] = []
|
||||||
|
|
||||||
r = requests.get(link)
|
response = requests.get(link, timeout=(5, 10))
|
||||||
r.encoding = r.apparent_encoding
|
response.encoding = response.apparent_encoding
|
||||||
|
|
||||||
line_regex = r"(?P<id>.)、(?P<name>.*):(</.*?>)?(?P<detail>.*放假.*。)"
|
line_regex = r"(?P<id>.)、(?P<name>.*):(</.*?>)?(?P<detail>.*放假.*。)"
|
||||||
for line in r.text.replace('<br/>', '\n').split('\n'):
|
for line in response.text.replace('<br/>', '\n').split('\n'):
|
||||||
match = re.search(line_regex, line)
|
if match := re.search(line_regex, line):
|
||||||
if match is None:
|
work, rest, *_ = match.group('detail').split('。')
|
||||||
continue
|
dates = ';'.join((match.group('name'), parse(work), parse(rest)))
|
||||||
|
print(dates) # 已知需要人工干预如下情况: 1.与周末连休, 2.补休
|
||||||
work, rest, *_ = match.group('detail').split('。')
|
results.append(f"{dates:30} // {match.group('detail')}")
|
||||||
dates = ';'.join((match.group('name'), parse(work), parse(rest)))
|
|
||||||
print(dates) # todo: 需要人工干预如下情况: 1.与周末连休, 2.补休
|
|
||||||
results.append(f"{dates:30} // {match.group('detail')}")
|
|
||||||
|
|
||||||
yield year, link, '\n'.join(results)
|
yield year, link, '\n'.join(results)
|
||||||
|
|
||||||
|
|
||||||
def parse(text: str) -> str:
|
def parse(text: str) -> str:
|
||||||
|
"""解析节假日安排数据"""
|
||||||
results: list[str] = []
|
results: list[str] = []
|
||||||
range_type_a = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
|
range_type_a = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<m2>\d?\d)月(?P<d2>\d?\d)日"
|
||||||
range_type_b = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
|
range_type_b = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日至(?P<d2>\d?\d)日"
|
||||||
single_date = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"
|
single_date = r"(?P<m1>\d?\d)月(?P<d1>\d?\d)日"
|
||||||
|
|
||||||
for item in text.split('、'):
|
for item in text.split('、'):
|
||||||
match = re.search(range_type_a, item)
|
if match := re.search(range_type_a, item):
|
||||||
if match is not None:
|
results.append(f"{match.group('m1')}.{match.group('d1')}-"
|
||||||
results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m2')}.{match.group('d2')}")
|
f"{match.group('m2')}.{match.group('d2')}")
|
||||||
print(f"\tA: {results[-1]:15} {item}")
|
print(f"\tA: {results[-1]:15} {item}")
|
||||||
continue
|
|
||||||
|
|
||||||
match = re.search(range_type_b, item)
|
elif match := re.search(range_type_b, item):
|
||||||
if match is not None:
|
results.append(f"{match.group('m1')}.{match.group('d1')}-"
|
||||||
results.append(f"{match.group('m1')}.{match.group('d1')}-{match.group('m1')}.{match.group('d2')}")
|
f"{match.group('m1')}.{match.group('d2')}")
|
||||||
print(f"\tB: {results[-1]:15} {item}")
|
print(f"\tB: {results[-1]:15} {item}")
|
||||||
continue
|
|
||||||
|
|
||||||
match = re.search(single_date, item)
|
elif match := re.search(single_date, item):
|
||||||
if match is not None:
|
|
||||||
results.append(f"{match.group('m1')}.{match.group('d1')}")
|
results.append(f"{match.group('m1')}.{match.group('d1')}")
|
||||||
print(f"\tS: {results[-1]:15} {item}")
|
print(f"\tS: {results[-1]:15} {item}")
|
||||||
continue
|
|
||||||
|
|
||||||
print(f"\tX: {'':15} {item}")
|
else:
|
||||||
|
print(f"\tX: {'':15} {item}")
|
||||||
|
|
||||||
return ','.join(results)
|
return ','.join(results)
|
||||||
|
|
||||||
|
|
||||||
def source() -> Iterator[Tuple[str, str]]:
|
def source() -> Iterator[Tuple[str, str]]:
|
||||||
|
"""获取官网发布通知列表"""
|
||||||
search_url = "http://sousuo.gov.cn/s.htm?t=paper&advance=false&n=&codeYear=&codeCode=" \
|
search_url = "http://sousuo.gov.cn/s.htm?t=paper&advance=false&n=&codeYear=&codeCode=" \
|
||||||
"&searchfield=title&sort=&q=%E8%8A%82%E5%81%87%E6%97%A5%E5%AE%89%E6%8E%92"
|
"&searchfield=title&sort=&q=%E8%8A%82%E5%81%87%E6%97%A5%E5%AE%89%E6%8E%92"
|
||||||
link_regex = r"href=['\"](?P<link>.*?)['\"].*国务院办公厅关于(?P<year>20\d\d)年.*通知"
|
link_regex = r"href=['\"](?P<link>.*?)['\"].*国务院办公厅关于(?P<year>20\d\d)年.*通知"
|
||||||
|
|
||||||
for line in requests.get(search_url).text.split('\n'):
|
for line in requests.get(search_url, timeout=(5, 10)).text.split('\n'):
|
||||||
match = re.search(link_regex, line)
|
if match := re.search(link_regex, line):
|
||||||
if match is None:
|
yield match.group('year'), match.group('link')
|
||||||
continue
|
|
||||||
yield match.group('year'), match.group('link')
|
|
||||||
|
|
||||||
|
|
||||||
def beijing_time() -> datetime:
|
def beijing_time() -> datetime:
|
||||||
|
"""获取当前北京时间"""
|
||||||
utc_time = datetime.utcnow().replace(tzinfo=timezone.utc)
|
utc_time = datetime.utcnow().replace(tzinfo=timezone.utc)
|
||||||
return utc_time.astimezone(timezone(timedelta(hours=8)))
|
return utc_time.astimezone(timezone(timedelta(hours=8)))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user