# coding:utf-8
# @File: report_utils.py
# @Author: 州的先生
# Date: 2019/12/7
# Blog: zmister.com
# MrDoc: export of projects (document collections) to Markdown / EPUB / PDF / Docx.
#
# NOTE(review): several string templates below contain no HTML/XML markup even
# though they are written to .xhtml/.xml/.html files. The markup looks like it
# was lost from this copy of the file; the visible text is kept verbatim and
# each affected template is flagged. Confirm against the canonical source.

from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.utils.translation import gettext_lazy as _
from bs4 import BeautifulSoup
import subprocess
import datetime,time
import re
import os,sys
import shutil
from django.core.wsgi import get_wsgi_application
sys.path.extend([settings.BASE_DIR])
os.environ.setdefault("DJANGO_SETTINGS_MODULE","MrDoc.settings")
application = get_wsgi_application()
import django
django.setup()
from app_doc.models import *
from subprocess import Popen
from loguru import logger
from app_doc.report_html2pdf import convert
import traceback
import time
import markdown
import yaml
# import PyPDF2
# from pdfminer import high_level


def validate_title(title):
    """Return *title* with characters that are illegal in file names replaced by ``_``.

    The replaced characters are ``/ \\ : * ? " < > | [ ]``.
    """
    rstr = r"[\/\\\:\*\?\"\<\>\|\[\]]"
    return re.sub(rstr, "_", title)


def _local_file_for(url_path, folders=("media", "static")):
    """Map a site-relative URL path to a regular file below an allowed folder.

    Document content is user supplied, so a path such as ``/media/../x`` or
    ``/MrDoc/settings.py`` must not make the exporter copy arbitrary server
    files into a downloadable archive. The path is resolved and accepted only
    when it ends up inside ``BASE_DIR/<folder>`` for one of *folders*.

    Returns the resolved file path, or ``None`` when the path escapes the
    allowed folders or does not name an existing regular file.
    """
    base_dir = str(settings.BASE_DIR)
    candidate = os.path.realpath(base_dir + url_path)
    for folder in folders:
        root = os.path.realpath(os.path.join(base_dir, folder))
        if candidate.startswith(root + os.sep) and os.path.isfile(candidate):
            return candidate
    return None


def _write_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any previous content.

    Truncating (instead of appending) keeps a stale temp directory left behind
    by a failed export from corrupting the next one.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(text)


# Export one project as a zip archive of Markdown files
@logger.catch()
class ReportMD():
    """Export every document of a project as ``.md`` files plus a ``mrdoc.yaml`` TOC."""

    def __init__(self,project_id):
        # Load the project
        self.pro_id = project_id
        self.project_data = Project.objects.get(pk=project_id)

        # Folder/archive name: <creator>_<project name>_<today>
        self.project_name = "{0}_{1}_{2}".format(
            self.project_data.create_user,
            validate_title(self.project_data.name),
            str(datetime.date.today())
        )

        # Working folder of this export and its media sub-folder
        self.project_path = settings.MEDIA_ROOT + "/reportmd_temp/{}".format(self.project_name)
        self.media_path = settings.MEDIA_ROOT + "/reportmd_temp/{}/media".format(self.project_name)
        # Creates reportmd_temp/, the project folder and media/ in one go
        os.makedirs(self.media_path, exist_ok=True)

    def _export_doc(self, doc):
        """Write one document as ``<name>.md`` and return its TOC entry."""
        name = validate_title(doc.name)
        # Markdown editors (modes 1 and 2) store their text in pre_content,
        # any other editor mode is exported from content.
        raw = doc.pre_content if doc.editor_mode in [1,2] else doc.content
        _write_text('{}/{}.md'.format(self.project_path, name), self.operat_md_media(raw))
        return {'name': name, 'file': name + '.md'}

    def work(self):
        """Export the project and return the path of the resulting zip file."""
        # Project level YAML data
        project_toc_list = {}
        project_toc_list['project_name'] = validate_title(self.project_data.name)
        project_toc_list['project_desc'] = self.project_data.intro
        project_toc_list['project_role'] = self.project_data.role
        project_toc_list['toc'] = []

        # First level documents
        data = Doc.objects.filter(top_doc=self.pro_id, parent_doc=0).order_by("sort")
        for d in data:
            top_item = self._export_doc(d)
            # Second level documents
            data_2 = list(Doc.objects.filter(parent_doc=d.id).order_by("sort"))
            if data_2:
                top_item['children'] = []
                for d2 in data_2:
                    sec_item = self._export_doc(d2)
                    # Third level documents
                    data_3 = list(Doc.objects.filter(parent_doc=d2.id).order_by("sort"))
                    if data_3:
                        sec_item['children'] = [self._export_doc(d3) for d3 in data_3]
                    top_item['children'].append(sec_item)
            project_toc_list['toc'].append(top_item)

        # Write the hierarchy YAML
        with open('{}/mrdoc.yaml'.format(self.project_path), 'w', encoding='utf-8') as toc_yaml:
            yaml.dump(project_toc_list,toc_yaml,allow_unicode=True)

        # Zip the working folder, then remove it
        shutil.make_archive(
            base_name=self.project_path,
            format='zip',
            root_dir=self.project_path
        )
        shutil.rmtree(self.project_path)

        return "{}.zip".format(self.project_path)

    def operat_md_media(self,md_content):
        """Copy locally hosted images referenced by *md_content* into the export.

        Every Markdown image whose target starts with ``/media/`` has its link
        rewritten to the relative form ``./media/...``, and the file is copied
        to the matching location below ``self.media_path`` when it exists
        inside the site's media folder. Other links are left untouched.

        Returns the rewritten Markdown text (``""`` for empty/``None`` input).
        """
        if not md_content:
            return ""

        pattern = r"\!\[.*?\]\(.*?\)"

        def relocate(match):
            media = match.group(0)
            # Target of the image link
            media_filename = media.replace('//','/').split("(")[-1].split(")")[0]
            if not media_filename.startswith("/media/"):
                return media

            source = _local_file_for(media_filename, folders=("media",))
            if source is not None:
                # Mirror the sub-path below /media/ so the rewritten link resolves
                relative_dir = os.path.dirname(media_filename[len("/media/"):])
                target_dir = os.path.join(self.media_path, relative_dir)
                os.makedirs(target_dir, exist_ok=True)
                try:
                    shutil.copy(source, target_dir)
                except OSError:
                    # Best effort: a missing/unreadable image must not abort the export
                    pass

            # Rewrite this occurrence only; a global str.replace would prefix
            # the same link again for every repeated use of one image.
            return media.replace(media_filename, "." + media_filename, 1)

        return re.sub(pattern, relocate, md_content)


# Export several projects as one zip of Markdown zips
class ReportMdBatch():
    """Export a list of projects with :class:`ReportMD` and bundle the archives."""

    def __init__(self,username,project_id_list):
        self.project_list = project_id_list
        self.username = username

        # Bundle folder: <username>_<yymmddHHMMSS>
        self.report_file_path = settings.MEDIA_ROOT + "/reportmd_temp/{}_{}".format(
            self.username,datetime.datetime.strftime(datetime.datetime.now(),"%y%m%d%H%M%S")
        )
        os.makedirs(self.report_file_path, exist_ok=True)

    def work(self):
        """Export every project and return the path of the combined zip file."""
        # Package every project
        project_file_list = []
        for project_id in self.project_list:
            report_func = ReportMD(project_id=project_id)
            project_file_list.append(report_func.work())

        # Move the per-project archives into the bundle folder
        for project_zip in project_file_list:
            shutil.move(project_zip,self.report_file_path)

        # Zip the bundle folder, then remove it
        shutil.make_archive(
            base_name=self.report_file_path,
            format='zip',
            root_dir=self.report_file_path
        )
        shutil.rmtree(self.report_file_path)

        return "{}.zip".format(self.report_file_path)


# Export EPUB
@logger.catch()
class ReportEPUB():
    """Export a project as an EPUB file below ``MEDIA_ROOT/report_epub``."""

    def __init__(self,project_id):
        self.project = Project.objects.get(id=project_id)
        self.base_path = settings.MEDIA_ROOT + '/report_epub/{}/'.format(project_id)

        # Create the EPUB directory layout
        for folder in ('/OEBPS', '/OEBPS/Images', '/OEBPS/Text', '/OEBPS/Styles', '/META-INF'):
            os.makedirs(self.base_path + folder, exist_ok=True)

        # Copy the style sheets
        shutil.copyfile(settings.BASE_DIR+'/static/report_epub/style.css',self.base_path + '/OEBPS/Styles/style.css')
        # shutil.copyfile(settings.BASE_DIR+'/static/katex/katex.min.css',self.base_path + '/OEBPS/Styles/katex.css')
        shutil.copyfile(settings.BASE_DIR+'/static/editor.md/css/editormd.min.css',self.base_path + '/OEBPS/Styles/editormd.css')
        # Copy the cover image
        shutil.copyfile(settings.BASE_DIR+'/static/report_epub/epub_cover1.jpg',self.base_path + '/OEBPS/Images/epub_cover1.jpg')

    def write_html(self, d, html_str):
        """Write the HTML of document *d* to ``OEBPS/Text/<d.id>.xhtml``.

        *html_str* is parsed with BeautifulSoup; style sheet links are added,
        iframes are replaced by a text note carrying their URL, and every
        site-relative ``src`` is pointed at ``../Images/`` with the file copied
        there when it lives in the site's media or static folder.
        """
        html_soup = BeautifulSoup(html_str, 'lxml')
        src_tag = html_soup.find_all(lambda tag: tag.has_attr("src"))  # every tag with a src
        iframe_tag = html_soup.find_all(name='iframe')

        if html_soup.body is not None:
            # Link the style sheets
            style_link = html_soup.new_tag(name='link',href="../Styles/style.css",rel="stylesheet",type="text/css")
            html_soup.body.insert_before(style_link)
            editormd_link = html_soup.new_tag(name='link',href='../Styles/editormd.css',rel="stylesheet",type="text/css")
            html_soup.body.insert_before(editormd_link)
        if html_soup.html is not None:
            # XHTML namespace on the root element
            html_soup.html['xmlns'] = "http://www.w3.org/1999/xhtml"

        # Replace iframe videos by a text line with the video URL
        for iframe in iframe_tag:
            iframe_src = iframe.get('src')
            iframe.name = 'p'
            # Translate the template first, then fill it in; formatting before
            # the lookup produces a msgid no catalogue can contain.
            iframe.string = _("本格式不支持iframe视频显示,视频地址为:{}").format(iframe_src)

        # Point site-relative files at the EPUB Images folder
        for src in src_tag:
            if src['src'].startswith("/"):
                src_path = src['src']                     # original path
                src_filename = src['src'].split("/")[-1]  # file name
                src['src'] = '../Images/' + src_filename  # path inside the EPUB
                source = _local_file_for(src_path)
                if source is None:
                    continue
                try:
                    shutil.copyfile(
                        src=source,
                        dst=self.base_path + '/OEBPS/Images/' + src_filename
                    )
                except OSError:
                    # Best effort: an unreadable image must not abort the export
                    pass

        temp_file_path = self.base_path + '/OEBPS/Text/{0}.xhtml'.format(d.id)
        _write_text(temp_file_path, str(html_soup))

    def generate_html(self):
        """Prepare the TOC data consumed by the other ``generate_*`` steps.

        Sets ``self.toc_list``, ``self.nav_str``, ``self.toc_summary_str``,
        ``self.manifest`` and ``self.spine``.
        """
        # NOTE(review): in this copy of the file the code that walks the
        # documents, calls write_html() and fills nav_str / toc_summary_str /
        # manifest / spine is missing (only the query and the final
        # assignments are visible, and manifest/spine were never defined).
        # They are initialised empty here so the method no longer raises
        # NameError; restore the loop from the canonical source.
        data = Doc.objects.filter(top_doc=self.project.id, parent_doc=0, status=1).order_by("sort")

        self.toc_list = [
            {
                'id': 0,
                'link': 'Text/toc_summary.xhtml',
                'pid': 0,
                'title': _('目录')
            }
        ]

        nav_str = ''''''
        toc_summary_str = ''''''
        manifest = ''''''
        spine = ''''''

        self.nav_str = nav_str
        self.toc_summary_str = toc_summary_str
        # self.config_json['toc'] = self.toc_list
        self.manifest = manifest
        self.spine = spine

    def generate_title_html(self):
        """Write the title page and the description page of the book."""
        # NOTE(review): template contains no XHTML markup in this copy - confirm.
        title_str = ''' 书籍标题

{title}

{author} 著


{create_time}

本书籍由MrDoc(mrdoc.zmister.com)生成

'''.format(
            title=self.project.name,
            author=self.project.create_user,
            create_time = time.strftime('%Y{y}%m{m}%d{d}').format(y='年',m='月',d='日')
        )
        _write_text(self.base_path+'/OEBPS/Text/book_title.xhtml', title_str)

        # NOTE(review): template contains no XHTML markup in this copy - confirm.
        desc_str = ''' 简介

书籍简介

{desc}

'''.format(desc=self.project.intro)
        _write_text(self.base_path+'/OEBPS/Text/book_desc.xhtml', desc_str)

    def generate_metainfo(self):
        """Write ``META-INF/container.xml``."""
        # NOTE(review): the container XML is blank in this copy - confirm.
        xml = ''' '''
        folder = self.base_path + '/META-INF'
        _write_text(folder+'/container.xml', xml)

    def generate_metatype(self):
        """Write the ``mimetype`` file."""
        _write_text(self.base_path+'/mimetype', 'application/epub+zip')

    def generate_cover(self):
        """Write the cover page."""
        # NOTE(review): template contains no XHTML markup in this copy - confirm.
        xml_str = ''' 封面
'''
        _write_text(self.base_path + '/OEBPS/Text/book_cover.xhtml', xml_str)

    def generate_toc_ncx(self):
        """Write ``OEBPS/toc.ncx`` from ``self.nav_str``."""
        # NOTE(review): template contains no NCX markup in this copy - confirm.
        ncx = ''' {title} {nav_map} '''.format(title=self.project.name,nav_map=self.nav_str)
        _write_text(self.base_path+'/OEBPS/toc.ncx', ncx)

    def generate_toc_html(self):
        """Write the table-of-contents page from ``self.toc_summary_str``."""
        # NOTE(review): template contains no XHTML markup in this copy - confirm.
        summary = ''' 目录

目    录

%s ''' % (self.toc_summary_str)
        _write_text(self.base_path+'/OEBPS/Text/toc_summary.xhtml', summary)

    def generate_opf(self):
        """Write ``OEBPS/content.opf`` (metadata, manifest and spine)."""
        # NOTE(review): template contains no OPF markup in this copy - confirm.
        content_info = ''' {title} zh {creator} urn:uuid:12345 MrDoc制作 {create_time} {desc} {manifest} {spine} '''
        _write_text(
            self.base_path+'/OEBPS/content.opf',
            content_info.format(
                title = self.project.name,
                creator = self.project.create_user,
                create_time = str(datetime.date.today()),
                desc=self.project.intro,
                manifest=self.manifest,
                spine = self.spine,
            )
        )

    def generate_epub(self):
        """Zip the working folder into an ``.epub`` file.

        Returns the path of the file WITHOUT the ``.epub`` extension, or
        ``None`` when packaging failed.
        """
        try:
            # The project name is user supplied; strip path-breaking characters
            zipfile_name = settings.MEDIA_ROOT + '/report_epub/{}'.format(
                validate_title(self.project.name)
            )+'_'+str(int(time.time()))
            zip_name = shutil.make_archive(
                base_name = zipfile_name,
                format='zip',
                root_dir= settings.MEDIA_ROOT + '/report_epub/{}'.format(self.project.id)
            )
            # Rename .zip to .epub
            os.rename(zip_name,zipfile_name+'.epub')
            # Remove the working folder
            shutil.rmtree(self.base_path)
            return zipfile_name
        except Exception:
            # traceback.print_exc() returns None, so printing it logged nothing useful
            logger.exception("Failed to package EPUB")
            return None

    def work(self):
        """Run every generation step and return the result of :meth:`generate_epub`."""
        self.generate_html()        # document pages and TOC data
        self.generate_metainfo()    # META-INF/container.xml
        self.generate_metatype()    # mimetype
        self.generate_toc_ncx()     # toc.ncx
        self.generate_toc_html()    # TOC page
        self.generate_cover()       # cover page
        self.generate_title_html()  # title and description pages
        self.generate_opf()         # content.opf
        epub_file = self.generate_epub()
        return epub_file


# Export PDF
@logger.catch()
class ReportPDF():
    """Export a project owned by a given user as a PDF below ``MEDIA_ROOT/report_pdf``."""

    def __init__(self,project_id,user_id):
        self.pro_id = project_id
        self.user_id = user_id

        # NOTE(review): template contains no HTML markup in this copy - confirm.
        # work() formats this template with pre_content=..., but the visible
        # text had no {pre_content} placeholder, so the document bodies never
        # reached the output; the placeholder is restored at the end.
        self.editormd_html_str = ''' {title}
本文件由MrDoc觅道文档生成

{project_name}

作者:{author}

日期:{create_time}

\n
{pre_content}
'''
        self.vditor_html_str = ''''''
        self.iceesitor_html_str = ''''''
        self.content_str = ""

    @staticmethod
    def _doc_body(doc):
        """Return the exportable text of *doc* followed by a newline.

        Editor modes 1 and 2 export ``pre_content``, mode 3 exports
        ``content``; any other mode contributes nothing.
        """
        if doc.editor_mode in [1,2]:
            return (doc.pre_content or '') + '\n'
        if doc.editor_mode == 3:
            return (doc.content or '') + '\n'
        return ''

    def work(self):
        """Render the project to PDF.

        Returns the path of the PDF file, or ``False`` when the project/user
        lookup fails, the conversion raises, or no PDF was produced.
        """
        try:
            user = User.objects.get(id=self.user_id)
            project = Project.objects.get(pk=self.pro_id,create_user=user)
        except ObjectDoesNotExist:
            logger.error("查询文集或用户失败")
            return False
        except Exception:
            logger.exception("未知异常")
            return False

        # Concatenate the documents, three levels deep
        # NOTE(review): the heading strings carry no markup in this copy - confirm.
        parts = [self.content_str]
        data = Doc.objects.filter(top_doc=self.pro_id,parent_doc=0,status=1).order_by("sort")
        for d in data:
            parts.append("\n\n{}\n\n\n\n".format(d.name))
            parts.append(self._doc_body(d))
            # Second level documents
            data_2 = Doc.objects.filter(parent_doc=d.id,status=1).order_by("sort")
            for d2 in data_2:
                parts.append("\n\n\n\n{}\n\n\n\n".format(d2.name))
                parts.append(self._doc_body(d2))
                # Third level documents
                data_3 = Doc.objects.filter(parent_doc=d2.id,status=1).order_by("sort")
                for d3 in data_3:
                    parts.append("\n\n\n\n{}\n\n\n\n".format(d3.name))
                    parts.append(self._doc_body(d3))

        # Make media links relative to the temporary HTML file
        self.content_str = "".join(parts).replace('![](/media/','![](../../media/')

        report_pdf_folder = settings.MEDIA_ROOT+'/report_pdf'
        os.makedirs(report_pdf_folder, exist_ok=True)

        # Shared base name of the temporary HTML file and the PDF; the project
        # name is user supplied, so strip path-breaking characters.
        temp_file_name = '{}_{}'.format(
            validate_title(project.name),
            str(datetime.datetime.today()).replace(' ', '-').replace(':', '-')
        )
        temp_file_path = report_pdf_folder + '/{0}.html'.format(temp_file_name)
        report_file_path = report_pdf_folder + '/{0}.pdf'.format(temp_file_name)

        _write_text(
            temp_file_path,
            self.editormd_html_str.format(
                title=project.name,
                pre_content=self.content_str,
                project_name=project.name,
                author=project.create_user.first_name if project.create_user.first_name != '' else project.create_user.username,
                create_time=str(datetime.date.today())
            )
        )

        # Convert HTML to PDF
        try:
            convert('file://'+temp_file_path,report_file_path)
        except Exception:
            logger.exception(_("生成PDF出错"))
            return False

        if os.path.exists(report_file_path):
            os.remove(temp_file_path)
            return report_file_path
        else:
            return False


# Export Docx
class ReportDocx():
    """Export a project as a single HTML-based ``.docx`` file below ``MEDIA_ROOT/report``."""

    def __init__(self,project_id):
        self.project = Project.objects.get(id=project_id)
        self.base_path = settings.MEDIA_ROOT + '/report/{}/'.format(project_id)

        self.content_str = ""
        # NOTE(review): template contains no HTML markup in this copy - confirm.
        self.doc_str = """ Print """

    def work(self):
        """Write the project to a ``.docx`` file and return its path."""
        # Concatenate the documents, three levels deep
        # NOTE(review): the heading strings carry no markup in this copy - confirm.
        parts = [self.content_str]
        data = Doc.objects.filter(top_doc=self.project.id,parent_doc=0,status=1).order_by("sort")
        for d in data:
            parts.append("\n\n{}\n\n".format(d.name))
            parts.append(d.content or '')
            # Second level documents
            data_2 = Doc.objects.filter(parent_doc=d.id).order_by("sort")
            for d2 in data_2:
                parts.append("\n\n{}\n\n".format(d2.name))
                parts.append(d2.content or '')
                # Third level documents
                data_3 = Doc.objects.filter(parent_doc=d2.id).order_by("sort")
                for d3 in data_3:
                    parts.append("\n\n{}\n\n".format(d3.name))
                    parts.append(d3.content or '')
        self.content_str = "".join(parts)

        # Turn site-relative src attributes into absolute file system paths
        soup = BeautifulSoup(self.content_str,'lxml')
        for src in soup.find_all(lambda tag:tag.has_attr("src")):
            if src['src'].startswith("/"):
                src['src'] = settings.BASE_DIR + src['src']
        # Write the rewritten markup; previously the untouched content_str was
        # written, so the src rewriting above never reached the file.
        if soup.body is not None:
            body_html = soup.body.decode_contents()
        else:
            body_html = self.content_str

        os.makedirs(self.base_path, exist_ok=True)

        temp_file_name = str(datetime.datetime.today()).replace(':', '-').replace(' ', '-').replace('.', '')
        temp_file_path = self.base_path + '/{0}.docx'.format(temp_file_name)

        _write_text(temp_file_path, self.doc_str + body_html)
        return temp_file_path


if __name__ == '__main__':
    # app = ReportMD(
    #     project_id=7
    # )
    # app.work()

    # app = ReportEPUB(project_id=20)
    # app.work()

    # ReportPDF needs the id of the user who owns the project
    demo_project_id = 20
    demo_owner = Project.objects.get(pk=demo_project_id).create_user
    app = ReportPDF(project_id=demo_project_id, user_id=demo_owner.id)
    app.work()

    # app = ReportDocx(project_id=20)
    # app.work()