{title}
本书籍由MrDoc(mrdoc.zmister.com)生成
# coding:utf-8
# @文件: report_utils.py
# @创建者:州的先生
# #日期:2019/12/7
# 博客地址:zmister.com
# MrDoc文集文档导出相关功能代码
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.utils.translation import gettext_lazy as _
from bs4 import BeautifulSoup
import subprocess
import datetime,time
import re
import os,sys
import shutil
from django.core.wsgi import get_wsgi_application
# Put the project base directory on the module search path.
# NOTE(review): settings.BASE_DIR is read here, before DJANGO_SETTINGS_MODULE
# is defaulted on the next line — presumably the variable is already set by
# the importing process; confirm before importing this module standalone.
sys.path.extend([settings.BASE_DIR])
# Fall back to the MrDoc settings module when no settings module is configured.
os.environ.setdefault("DJANGO_SETTINGS_MODULE","MrDoc.settings")
# Module-level side effect: a WSGI application object is created at import time.
application = get_wsgi_application()
import django
# Initialise Django before the model imports that follow.
django.setup()
from app_doc.models import *
from subprocess import Popen
from loguru import logger
from app_doc.report_html2pdf import convert
import traceback
import time
import markdown
import yaml
# import PyPDF2
# from pdfminer import high_level
# 替换前端传来的非法字符
def validate_title(title):
    """Return ``title`` with file-name-unsafe characters replaced by ``_``.

    The characters replaced are: slash, backslash, colon, asterisk, question
    mark, double quote, angle brackets, pipe and square brackets. Every other
    character is passed through unchanged.
    """
    illegal_chars = re.compile(r"[\/\\\:\*\?\"\<\>\|\[\]]")
    return illegal_chars.sub("_", title)
# 导出MD文件压缩包
# NOTE(review): logger.catch() is applied to the class itself, so a failing
# constructor is presumably logged and yields None instead of raising; some
# loguru releases may reject class targets altogether — confirm against the
# pinned loguru version.
@logger.catch()
class ReportMD():
    """Export one project as a zip archive of Markdown files.

    The archive contains one ``<name>.md`` file per document (up to three
    levels deep), a ``media`` folder holding the locally hosted images the
    documents reference, and a ``mrdoc.yaml`` file describing the hierarchy.

    NOTE(review): every document is written flat into the same folder under
    its sanitized name, so documents whose sanitized names collide overwrite
    each other.
    """

    # Number of document levels exported (top, second and third level).
    _MAX_DEPTH = 3

    def __init__(self, project_id):
        """Load the project and create its temporary export folders.

        Args:
            project_id: Primary key of the ``Project`` to export.
        """
        self.pro_id = project_id
        self.project_data = Project.objects.get(pk=project_id)
        # Folder / archive name: <creator>_<sanitized project name>_<today>
        self.project_name = "{0}_{1}_{2}".format(
            self.project_data.create_user,
            validate_title(self.project_data.name),
            str(datetime.date.today())
        )
        self.project_path = settings.MEDIA_ROOT + "/reportmd_temp/{}".format(self.project_name)
        self.media_path = settings.MEDIA_ROOT + "/reportmd_temp/{}/media".format(self.project_name)
        # One call creates the temp root, the project folder and its media
        # folder, and tolerates folders left over from an earlier run.
        os.makedirs(self.media_path, exist_ok=True)

    def work(self):
        """Write all documents and the TOC file, then zip the export folder.

        Returns:
            Path of the generated archive (``<project_path>.zip``). The
            temporary export folder is removed before returning.
        """
        top_docs = Doc.objects.filter(top_doc=self.pro_id, parent_doc=0).order_by("sort")
        project_toc_list = {
            'project_name': validate_title(self.project_data.name),
            'project_desc': self.project_data.intro,
            'project_role': self.project_data.role,
            'toc': self._export_level(top_docs, depth=1),
        }
        # 'w' rather than 'a+': a folder left behind by an interrupted export
        # must not end up with two concatenated YAML documents.
        with open('{}/mrdoc.yaml'.format(self.project_path), 'w', encoding='utf-8') as toc_yaml:
            yaml.dump(project_toc_list, toc_yaml, allow_unicode=True)
        shutil.make_archive(
            base_name=self.project_path,
            format='zip',
            root_dir=self.project_path
        )
        shutil.rmtree(self.project_path)
        return "{}.zip".format(self.project_path)

    def _export_level(self, docs, depth):
        """Write ``docs`` (and their children) to disk; return their TOC entries."""
        entries = []
        for doc in docs:
            entry = self._write_doc(doc)
            if depth < self._MAX_DEPTH:
                children = list(Doc.objects.filter(parent_doc=doc.id).order_by("sort"))
                # The 'children' key is only present when there are children.
                if children:
                    entry['children'] = self._export_level(children, depth + 1)
            entries.append(entry)
        return entries

    def _write_doc(self, doc):
        """Write one document to ``<sanitized name>.md`` and return its TOC entry."""
        md_name = validate_title(doc.name)
        # Editor modes 1 and 2 export the Markdown source (pre_content);
        # any other mode exports the stored HTML (content).
        raw_content = doc.pre_content if doc.editor_mode in [1, 2] else doc.content
        md_content = self.operat_md_media(raw_content)
        with open('{}/{}.md'.format(self.project_path, md_name), 'w', encoding='utf-8') as md_file:
            md_file.write(md_content)
        return {'name': md_name, 'file': md_name + '.md'}

    def operat_md_media(self, md_content):
        """Copy locally hosted images into the export and relativize their links.

        Args:
            md_content: Document text; ``None`` is treated as empty text.

        Returns:
            The text with each ``/media...`` image target prefixed with ``.``
            so that it resolves against the exported ``media`` folder.
        """
        if md_content is None:
            return ""
        pattern = r"\!\[.*?\]\(.*?\)"
        targets = (
            media.replace('//', '/').split("(")[-1].split(")")[0]
            for media in re.findall(pattern, md_content)
        )
        # str.replace() below rewrites every occurrence at once, so each
        # distinct target must be handled only once; otherwise an image that
        # is referenced twice would be rewritten to "../media/...".
        for media_filename in dict.fromkeys(targets):
            if not media_filename.startswith("/media"):
                continue
            parts = media_filename.split("/")
            # Skip targets without a path below /media, and targets with ".."
            # segments, which could pull files from outside the media tree
            # into the archive.
            if len(parts) < 3 or not parts[2] or ".." in parts:
                continue
            sub_folder = "/" + parts[2]
            os.makedirs(self.media_path + sub_folder, exist_ok=True)
            md_content = md_content.replace(media_filename, "." + media_filename)
            try:
                shutil.copy(settings.BASE_DIR + media_filename, self.media_path + sub_folder)
            except OSError as exc:
                # Best effort: a missing or unreadable file must not abort the export.
                logger.warning("Skipped media file {} during Markdown export: {}", media_filename, exc)
        return md_content
# 批量导出文集Markdown压缩包
class ReportMdBatch():
    """Bundle the Markdown exports of several projects into one zip archive."""

    def __init__(self, username, project_id_list):
        """Remember the request and create the batch staging folder.

        Args:
            username: Name used as the prefix of the staging folder / archive.
            project_id_list: Ids of the projects to export.
        """
        self.project_list = project_id_list
        self.username = username
        # Staging folder: <username>_<yymmddHHMMSS> under the shared temp root.
        self.report_file_path = settings.MEDIA_ROOT + "/reportmd_temp/{}_{}".format(
            self.username, datetime.datetime.strftime(datetime.datetime.now(), "%y%m%d%H%M%S")
        )
        os.makedirs(self.report_file_path, exist_ok=True)

    def work(self):
        """Export each project, collect the zips, and archive them together.

        Returns:
            Path of the combined archive (``<report_file_path>.zip``). The
            staging folder is removed before returning.
        """
        project_file_list = []
        # A repeated id would produce the same zip path twice and make the
        # second move fail, so each project is exported once, in order.
        for project_id in dict.fromkeys(self.project_list):
            report_func = ReportMD(project_id=project_id)
            # ReportMD is wrapped in logger.catch(); presumably a failed
            # construction is logged there and None comes back. Skip that
            # project instead of failing the whole batch on `None.work()`.
            if report_func is None:
                logger.warning("Skipped project {} in batch Markdown export", project_id)
                continue
            project_file_list.append(report_func.work())
        # Move every per-project archive into the staging folder.
        for zip_path in project_file_list:
            shutil.move(zip_path, self.report_file_path)
        shutil.make_archive(
            base_name=self.report_file_path,
            format='zip',
            root_dir=self.report_file_path
        )
        shutil.rmtree(self.report_file_path)
        return "{}.zip".format(self.report_file_path)
# 导出EPUB
@logger.catch()
class ReportEPUB():
def __init__(self,project_id):
    """Load the project and prepare the EPUB working directory.

    Creates the ``OEBPS`` (``Images``, ``Text``, ``Styles``) and ``META-INF``
    folders under ``<MEDIA_ROOT>/report_epub/<project_id>/`` when missing,
    then copies the bundled stylesheets and cover image into them.
    """
    self.project = Project.objects.get(id=project_id)
    self.base_path = settings.MEDIA_ROOT + '/report_epub/{}/'.format(project_id)

    # Directory skeleton of the EPUB package, parents listed first.
    skeleton = ('/OEBPS', '/OEBPS/Images', '/OEBPS/Text', '/OEBPS/Styles', '/META-INF')
    for sub_dir in skeleton:
        target_dir = self.base_path + sub_dir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    # (file below BASE_DIR, destination below base_path): two stylesheets,
    # then the cover image. The KaTeX stylesheet is not copied.
    assets = (
        ('/static/report_epub/style.css', '/OEBPS/Styles/style.css'),
        ('/static/editor.md/css/editormd.min.css', '/OEBPS/Styles/editormd.css'),
        ('/static/report_epub/epub_cover1.jpg', '/OEBPS/Images/epub_cover1.jpg'),
    )
    for static_rel, epub_rel in assets:
        shutil.copyfile(settings.BASE_DIR + static_rel, self.base_path + epub_rel)
# 将文档内容写入HTML文件
def write_html(self, d, html_str):
    """Turn one document's HTML into an XHTML file inside the EPUB tree.

    Args:
        d: Document object; its ``id`` names the output file
            ``OEBPS/Text/<id>.xhtml``.
        html_str: HTML text of the document (heading plus content).

    Side effects:
        Copies every root-relative ``src`` resource into ``OEBPS/Images`` and
        writes the XHTML file.

    NOTE(review): resources are stored under their bare file name, so two
    resources with the same name in different folders overwrite each other.
    """
    html_soup = BeautifulSoup(html_str, 'lxml')
    # Collected before the iframes are renamed below, so former iframes with
    # a src attribute are also processed by the src loop.
    src_tag = html_soup.find_all(lambda tag: tag.has_attr("src"))
    iframe_tag = html_soup.find_all(name='iframe')

    # Link the two stylesheets copied into ../Styles.
    style_link = html_soup.new_tag(name='link',href="../Styles/style.css",rel="stylesheet",type="text/css")
    html_soup.body.insert_before(style_link)
    editormd_link = html_soup.new_tag(name='link',href='../Styles/editormd.css',rel="stylesheet",type="text/css")
    html_soup.body.insert_before(editormd_link)
    html_soup.html['xmlns'] = "http://www.w3.org/1999/xhtml"

    # Replace each iframe with a paragraph that shows the video address.
    for iframe in iframe_tag:
        iframe_src = iframe.get('src')
        iframe.name = 'p'
        # Translate the template first, then fill in the address; formatting
        # inside _() would look up the already-formatted text, which can
        # never match a catalog entry.
        iframe.string = str(_("本格式不支持iframe视频显示,视频地址为:{}")).format(iframe_src)

    # Point root-relative resources at ../Images and copy them there.
    for src in src_tag:
        src_path = src['src']
        if not src_path.startswith("/"):
            continue
        # Refuse ".." segments: they could copy files from outside the
        # project tree into the book.
        if ".." in src_path.split("/"):
            continue
        src_filename = src_path.split("/")[-1]
        src['src'] = '../Images/' + src_filename
        try:
            shutil.copyfile(
                src=settings.BASE_DIR + src_path,
                dst=self.base_path + '/OEBPS/Images/' + src_filename
            )
        except OSError as exc:
            # Best effort: a missing or unreadable resource must not abort the export.
            logger.warning("Skipped resource {} during EPUB export: {}", src_path, exc)

    temp_file_path = self.base_path + '/OEBPS/Text/{0}.xhtml'.format(d.id)
    # 'w' rather than 'a+': a file left over from an earlier export must be
    # replaced, not extended with a second complete document.
    with open(temp_file_path, 'w', encoding='utf-8') as htmlfile:
        # NOTE(review): the empty prefix looks like a placeholder for an XML
        # declaration — confirm against the intended output.
        htmlfile.write('' + str(html_soup))
# 生成文档HTML
def generate_html(self):
# 查询文档
data = Doc.objects.filter(top_doc=self.project.id, parent_doc=0, status=1).order_by("sort")
self.toc_list = [
{
'id': 0,
'link': 'Text/toc_summary.xhtml',
'pid': 0,
'title': _('目录')
}
]
nav_str = ''''''
nav_num = 1
# content.opf相关
manifest = '''
"
else:
toc_summary_str += ""
nav_str += '{}
".format(d.name)
if d.content is None:
d.content = markdown.markdown(
d.pre_content,
extensions=['markdown.extensions.fenced_code','markdown.extensions.tables']
)
html_str += d.content
self.write_html(d=d,html_str=html_str) # 生成HTML
# 生成HTML的目录位置
toc = {
'id':d.id,
'link':'{}.xhtml'.format(d.id),
'pid':d.parent_doc,
'title':d.name
}
self.toc_list.append(toc)
# nav
toc_nav = ''''
for d2 in data_2:
html_str = "
{}
".format(d2.name)
if d2.content is None:
d2.content = markdown.markdown(
d2.pre_content,
extensions=['markdown.extensions.fenced_code', 'markdown.extensions.tables']
)
html_str += d2.content
self.write_html(d=d2,html_str=html_str)
# 生成HTML的目录位置
toc = {
'id': d2.id,
'link': '{}.xhtml'.format(d2.id),
'pid': d2.parent_doc,
'title': d2.name
}
self.toc_list.append(toc)
toc_nav = ''''
for d3 in data_3:
html_str = "
{}
".format(d3.name)
# 如果文档没有HTML内容,将Markdown转换为HTML
if d3.content is None:
d3.content = markdown.markdown(
d3.pre_content,
extensions=['markdown.extensions.fenced_code', 'markdown.extensions.tables']
)
html_str += d3.content
self.write_html(d=d3,html_str=html_str)
# 生成HTML的目录位置
toc = {
'id': d3.id,
'link': '{}.xhtml'.format(d3.id),
'pid': d3.parent_doc,
'title': d3.name
}
self.toc_list.append(toc)
toc_nav = '''
书籍简介
{desc}
作者:{author}
日期:{create_time}