# coding:utf-8
# @文件: report_utils.py
# @创建者:州的先生
# #日期:2019/12/7
# 博客地址:zmister.com
# MrDoc文集文档导出相关功能代码
from django.conf import settings
import subprocess
import datetime,time
import re
import os,sys
import shutil
from bs4 import BeautifulSoup
from django.core.wsgi import get_wsgi_application
# Make the project root importable.
# NOTE(review): settings.BASE_DIR is read here, before DJANGO_SETTINGS_MODULE
# is given a default on the next line - confirm the environment variable is
# always set by whoever imports this module.
sys.path.extend([settings.BASE_DIR])
# Default the settings module to MrDoc.settings unless the environment
# already defines one (setdefault never overrides an existing value).
os.environ.setdefault("DJANGO_SETTINGS_MODULE","MrDoc.settings")
# Build the WSGI application object at import time.
application = get_wsgi_application()
import django
# Initialise Django before the model imports that follow.
django.setup()
from app_doc.models import *
import traceback
import time
from pyppeteer import launch
import asyncio
from loguru import logger
# import PyPDF2
# from pdfminer import high_level
# Convert a JS-rendered dynamic graphic into a static image
@logger.catch()
def geneta_js_img(html_path,img_path,types):
    '''
    Render a local HTML file in headless Chromium and save a JPEG screenshot
    of the first element that matches the requested graphic type.

    The function is wrapped by ``logger.catch()``, so errors raised here are
    handed to that decorator.

    :param html_path: path of the HTML source file (opened through a file:// URL)
    :param img_path: path the static image is written to
    :param types: kind of graphic to capture; one of mindmap, tex, flowchart,
        seque or echart
    :return: None
    :raises KeyError: if ``types`` is not one of the supported kinds
    :raises ValueError: if the page contains no element for the requested kind
    '''
    type_map = {
        'mindmap':'.mindmap', # mind map
        'tex':'.editormd-tex', # math formula
        'flowchart':'.flowchart', # flow chart
        'seque':'.sequence-diagram', # sequence diagram
        'echart':'.echart', # echart chart
    }
    # Resolve the selector first so an unsupported type fails before a
    # browser process is started.
    selector = type_map[types]

    async def main():
        launch_options = {
            'headless': True,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False,
        }
        # A configured Chromium binary is launched with the configured
        # arguments; otherwise pyppeteer's defaults are used.
        if settings.CHROMIUM_PATH:
            launch_options['executablePath'] = str(settings.CHROMIUM_PATH)
            launch_options['args'] = settings.CHROMIUM_ARGS
        browser = await launch(**launch_options)
        try:
            page = await browser.newPage()
            await page.goto('file://' + html_path, {'waitUntil': 'networkidle0'})
            element = await page.querySelector(selector)
            if element is None:
                raise ValueError(
                    "no element matching '{}' found in {}".format(selector, html_path)
                )
            await element.screenshot({'type': 'jpeg', 'quality': 100, 'path': img_path})
        finally:
            # Always shut the browser down, otherwise a failed render leaves
            # a Chromium process behind.
            await browser.close()

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    except Exception:
        # Best-effort: retry the whole render once before giving up.
        # KeyboardInterrupt/SystemExit are deliberately not retried.
        loop.run_until_complete(main())
    finally:
        loop.close()
# Convert an HTML file to PDF
@logger.catch()
def html_to_pdf(html_path,pdf_path):
    '''
    Render a local HTML file in headless Chromium and print it to an A4 PDF
    with 1cm margins and a header/footer.

    The function is wrapped by ``logger.catch()``, so errors raised here are
    handed to that decorator.

    :param html_path: path of the HTML source file (opened through a file:// URL)
    :param pdf_path: path the PDF is written to
    :return: None
    '''
    # NOTE(review): the header/footer markup was unreadable in the reviewed
    # source (only the '/' separator of the footer survived). It is rebuilt
    # here as an empty header and a centred "current page/total pages" footer
    # using Chromium's pageNumber/totalPages template classes - confirm
    # against the intended layout.
    header_template = '<span></span>'
    footer_template = (
        '<div style="width:100%;text-align:center;font-size:10px;">'
        '<span class="pageNumber"></span>/<span class="totalPages"></span>'
        '</div>'
    )

    async def main():
        launch_options = {
            'headless': True,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False,
        }
        # A configured Chromium binary is launched with the configured
        # arguments; otherwise pyppeteer's defaults are used.
        if settings.CHROMIUM_PATH:
            launch_options['executablePath'] = str(settings.CHROMIUM_PATH)
            launch_options['args'] = settings.CHROMIUM_ARGS
        browser = await launch(**launch_options)
        try:
            page = await browser.newPage()
            await page.goto('file://' + html_path, {'waitUntil': 'networkidle0'})
            await page.pdf({
                'path':pdf_path,
                'format':'A4',
                'displayHeaderFooter':True,
                'headerTemplate':header_template,
                'footerTemplate':footer_template,
                'margin':{
                    'top':'1cm',
                    'right':'1cm',
                    'bottom':'1cm',
                    'left':'1cm'
                }
            })
        finally:
            # Always shut the browser down, otherwise a failed render leaves
            # a Chromium process behind.
            await browser.close()

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    except Exception:
        # Best-effort: retry the whole conversion once before giving up.
        # KeyboardInterrupt/SystemExit are deliberately not retried.
        loop.run_until_complete(main())
    finally:
        loop.close()
# Export a project's documents as a zip archive of Markdown files
# NOTE(review): logger.catch() is applied to a class here; loguru documents it
# for functions and recent releases may reject classes - confirm against the
# loguru version this project pins.
@logger.catch()
class ReportMD():
    '''
    Export every document of a project (up to three levels deep) as Markdown
    files, bundled with the local media they reference, in one zip archive.

    :param project_id: primary key of the Project to export
    '''
    def __init__(self,project_id):
        # Look up the project to export
        self.pro_id = project_id
        project_data = Project.objects.get(pk=project_id)
        # Export name: <creator>_<project name>_<today's date>
        self.project_name = "{0}_{1}_{2}".format(
            project_data.create_user,
            project_data.name,
            str(datetime.date.today())
        )
        # Temporary working folder of this export and its media sub-folder
        self.project_path = settings.MEDIA_ROOT + "/reportmd_temp/{}".format(self.project_name)
        self.media_path = settings.MEDIA_ROOT + "/reportmd_temp/{}/media".format(self.project_name)
        # One call creates reportmd_temp, the project folder and the media
        # folder, and does not fail when they already exist.
        os.makedirs(self.media_path, exist_ok=True)

    def work(self):
        '''
        Write one .md file per document, zip the export folder and delete it.

        :return: path of the generated zip archive
        '''
        try:
            # First-level documents of the project
            top_docs = Doc.objects.filter(top_doc=self.pro_id, parent_doc=0).order_by("sort")
            for d in top_docs:
                self._write_doc(d)
                # Second-level documents
                for d2 in Doc.objects.filter(parent_doc=d.id).order_by("sort"):
                    self._write_doc(d2)
                    # Third-level documents
                    for d3 in Doc.objects.filter(parent_doc=d2.id).order_by("sort"):
                        self._write_doc(d3)
            # Compress the export folder into <project_path>.zip
            shutil.make_archive(
                base_name=self.project_path,
                format='zip',
                root_dir=self.project_path
            )
        finally:
            # Remove the working folder even when the export fails, so stale
            # files cannot leak into a later export with the same name.
            shutil.rmtree(self.project_path, ignore_errors=True)
        return "{}.zip".format(self.project_path)

    def _write_doc(self, doc):
        '''Write a single document as <name>.md inside the export folder.'''
        md_content = self.operat_md_media(doc.pre_content)
        with open('{}/{}.md'.format(self.project_path, doc.name), 'w', encoding='utf-8') as files:
            files.write(md_content)

    # Handle the media files referenced by Markdown content
    def operat_md_media(self,md_content):
        '''
        Copy locally hosted images referenced by the Markdown text into the
        export's media folder and make their links relative.

        :param md_content: Markdown source of one document
        :return: the Markdown text with rewritten local image links
        '''
        # Markdown image syntax: ![alt](target)
        pattern = r"\!\[.*?\]\(.*?\)"
        media_list = re.findall(pattern, md_content)
        handled = set()
        for media in media_list:
            media_filename = media.split("(")[-1].split(")")[0] # link target of the image
            # Only site-local (absolute path) targets are copied
            if not media_filename.startswith("/"):
                continue
            # str.replace below rewrites every occurrence at once, so a target
            # that appears several times must be handled only once, otherwise
            # it would be prefixed again ("../media/...").
            if media_filename in handled:
                continue
            handled.add(media_filename)
            path_parts = media_filename.split("/")
            # The sub-folder name is taken from the 4th path component; leave
            # shallower targets untouched instead of raising IndexError.
            if len(path_parts) < 4:
                continue
            # Document content is user-authored: refuse ".." so that neither
            # the copied source nor the target folder can escape its root.
            if ".." in path_parts:
                continue
            sub_folder = "/" + path_parts[3] # name of the sub-folder
            os.makedirs(self.media_path+sub_folder, exist_ok=True)
            # Point the link in the Markdown at a relative location
            # NOTE(review): the link becomes "./<original path>" while the file
            # is copied to "<media_path>/<4th component>/"; confirm that the
            # exported links resolve for the site's media URL layout.
            md_content = md_content.replace(media_filename, "." + media_filename)
            # Copy the media file into the export folder
            try:
                shutil.copy(settings.BASE_DIR + media_filename, self.media_path+sub_folder)
            except FileNotFoundError:
                # Best-effort: a missing file must not abort the export
                logger.warning("media file not found, skipped: {}".format(media_filename))
        return md_content
# 导出EPUB
@logger.catch()
class ReportEPUB():
def __init__(self,project_id):
    '''
    Load the project and prepare the EPUB working directory: create the
    OEBPS/META-INF folder layout and copy in the bundled stylesheets and
    cover image.

    :param project_id: id of the Project to export
    '''
    self.project = Project.objects.get(id=project_id)
    self.base_path = settings.MEDIA_ROOT + '/report_epub/{}/'.format(project_id)
    # Create any folder of the EPUB layout that does not exist yet
    epub_dirs = (
        '/OEBPS',
        '/OEBPS/Images',
        '/OEBPS/Text',
        '/OEBPS/Styles',
        '/META-INF',
    )
    for epub_dir in epub_dirs:
        target_dir = self.base_path + epub_dir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
    # (source below BASE_DIR, destination below base_path): the stylesheets
    # first, then the cover image
    bundled_assets = (
        ('/static/report_epub/style.css', '/OEBPS/Styles/style.css'),
        ('/static/katex/katex.min.css', '/OEBPS/Styles/katex.css'),
        ('/static/editor.md/css/editormd.min.css', '/OEBPS/Styles/editormd.css'),
        ('/static/report_epub/epub_cover1.jpg', '/OEBPS/Images/epub_cover1.jpg'),
    )
    for asset_source, asset_target in bundled_assets:
        shutil.copyfile(settings.BASE_DIR + asset_source, self.base_path + asset_target)
# 将文档内容写入HTML文件
def write_html(self, d, html_str):
# 使用BeautifulSoup解析拼接好的HTML文本
html_soup = BeautifulSoup(html_str, 'lxml')
src_tag = html_soup.find_all(lambda tag: tag.has_attr("src")) # 查找所有包含src的标签
mindmap_tag = html_soup.select('svg.mindmap') # 查找所有脑图的SVG标签
tex_tag = html_soup.select('.editormd-tex') # 查找所有公式标签
flowchart_tag = html_soup.select('.flowchart') # 查找所有流程图标签
seque_tag = html_soup.select('.sequence-diagram') # 查找所有时序图标签
echart_tag = html_soup.select('.echart') # 查找所有echart图表标签
code_tag = html_soup.find_all(name="code") # 查找code代码标签
iframe_tag = html_soup.find_all(name='iframe') # 查找iframe
# 添加css样式标签
style_link = html_soup.new_tag(name='link',href="../Styles/style.css",rel="stylesheet",type="text/css")
katex_link = html_soup.new_tag(name='link',href='../Styles/katex.css',rel="stylesheet",type="text/css")
editormd_link = html_soup.new_tag(name='link',href='../Styles/editormd.css',rel="stylesheet",type="text/css")
html_soup.body.insert_before(style_link)
html_soup.body.insert_before(katex_link)
# html_soup.body.insert_before(editormd_link)
# 添加xlm标签声明
# html_soup.html.insert_before('')
# 添加html标签的xmlns属性
html_soup.html['xmlns'] = "http://www.w3.org/1999/xhtml"
# 替换iframe视频为视频URL链接文本
for iframe in iframe_tag:
iframe_src = iframe.get('src')
iframe.name = 'p'
iframe.string = "本格式不支持iframe视频显示,视频地址为:{}".format(iframe_src)
# 替换HTML文本中静态文件的相对链接为绝对链接
for src in src_tag:
if src['src'].startswith("/"):
src_path = src['src'] # 媒体文件原始路径
src_filename = src['src'].split("/")[-1] # 媒体文件名
src['src'] = '../Images/' + src_filename # 媒体文件在EPUB中的路径
# 复制文件到epub的Images文件夹
try:
shutil.copyfile(
src= settings.BASE_DIR + src_path,
dst= self.base_path + '/OEBPS/Images/' + src_filename
)
except FileNotFoundError as e:
pass
# 替换HTML文本中的脑图为静态图片
for mindmap in mindmap_tag:
# print('转换脑图')
html_str = '''
Markmap
{svg_content}
'''.format(svg_content=mindmap)
# 脑图HTML文件路径
temp_mindmap_html = settings.BASE_DIR +'/media/report_epub/mindmap_{}.html'.format(str(time.time()))
mindmap_img_filename = 'mindmap_{}.jpg'.format(str(time.time()))
mindmap_img_path = self.base_path + '/OEBPS/Images/' + mindmap_img_filename
# 写入临时HTML文件
with open(temp_mindmap_html,'w+',encoding='utf-8') as mindmap_html:
mindmap_html.write(html_str)
# 生成静态图片
geneta_js_img(temp_mindmap_html,mindmap_img_path,'mindmap')
# 将图片标签设置进去
mindmap.name = 'img'
mindmap['src'] = '../Images/' + mindmap_img_filename
mindmap.string = ''
os.remove(temp_mindmap_html) # 删除临时的HTML
# 替换公式为静态图片
for tex in tex_tag:
# print('转换公式')
html_str = '''
Markmap
{content}