使用Python3将Markdown(.md)文本转换成html、pdf
一、Markdown中不同的文本内容会分成不同的文本块,并通过markdown的语法控制进行文本的拼接,组成新的文件。
二、利用Python3实现(.md)文件转换成(.html)文件
在cmd命令行下进入(.py)文件目录下,使用命令进行执行(脚本从标准输入读取Markdown文本,并把HTML写到标准输出,因此通过重定向指定输入、输出文件):
> python md2html.py < file.md > file.html
import sys, re
# Generator helpers
def lines(file):
    """Yield every line of *file*, followed by one extra newline.

    The trailing ``'\\n'`` acts as a sentinel blank line so that a consumer
    that splits on blank lines also flushes the last block of text.

    Args:
        file: Any iterable of text lines (for example an open text file).

    Yields:
        Each item of *file* unchanged, then a final ``'\\n'``.
    """
    yield from file
    # Append a blank line at the end of the text.
    yield '\n'
def blocks(file):
    """Split *file* into blocks of text separated by blank lines.

    Consecutive non-blank lines are collected; when a blank line is reached
    the collected lines are joined, stripped and yielded as one block.
    ``lines()`` appends a final blank line, so the last block is emitted too.

    Args:
        file: Any iterable of text lines.

    Yields:
        One stripped string per block of consecutive non-blank lines.
    """
    block = []
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            # A blank line ends the current block (repeated blanks are skipped).
            yield ''.join(block).strip()
            block = []
# Handlers that render text blocks
class Handler:
    """Base class for handlers.

    Calls are dispatched by name: ``start('x')`` invokes ``start_x()``,
    ``end('x')`` invokes ``end_x()`` and ``sub('x')`` builds a replacement
    function around ``sub_x(match)``. Missing methods are silently ignored.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` if it exists.

        Returns:
            The method's return value, or ``None`` when there is no callable
            attribute with that name.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke ``start_<name>()`` if it is defined."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke ``end_<name>()`` if it is defined."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a function usable as the replacement argument of ``re.sub``.

        The returned function forwards the match object to ``sub_<name>``.
        If that method is missing or returns ``None``, the matched text is
        returned unchanged.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No substitution available: keep the original text.
                result = match.group(0)
            return result
        return substitution
class HTMLRenderer(Handler):
    """Handler that renders blocks as HTML.

    Each ``start_*`` / ``end_*`` method prints the opening / closing tag for
    one kind of block; the ``sub_*`` methods return the markup that replaces
    an inline match; ``feed`` prints the block text itself.
    """

    def start_document(self):
        print('<html><head><title>Python⽂本解析</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p >')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2 >')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul >')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1 >')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Return the first captured group wrapped in ``<em>``."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Return a link, opening in a new tab, to the captured URL."""
        url = match.group(1)
        return '<a target="_blank" href="%s">%s</a>' % (url, url)

    def sub_mail(self, match):
        """Return a ``mailto:`` link for the captured address."""
        address = match.group(1)
        return '<a href="mailto:%s">%s</a>' % (address, address)

    def feed(self, data):
        """Print the text of a block."""
        print(data)
# Rules: decide how each text block should be handled
class Rule:
    """Base class for rules.

    ``action`` reads ``self.type``, so subclasses must define a ``type``
    attribute naming the kind of block they produce.
    """

    def action(self, block, handler):
        """Emit *block* wrapped in the start/end markup for ``self.type``.

        Args:
            block: Text of the block to emit.
            handler: Object providing ``start(name)``, ``feed(text)`` and
                ``end(name)``.

        Returns:
            ``True``.
        """
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True
class HeadingRule(Rule):
    """Rule for blocks of type ``'heading'``.

    A heading is a single line of at most 70 characters that does not end
    with a colon.
    """

    type = 'heading'

    def condition(self, block):
        """Return ``True`` if *block* matches the heading criteria."""
        return (
            '\n' not in block
            and len(block) <= 70
            # endswith() is also safe on an empty block, unlike block[-1].
            and not block.endswith(':')
        )
class TitleRule(HeadingRule):
    """Rule for blocks of type ``'title'``.

    Only the first block tested can be the title: ``first`` is cleared on the
    first call to ``condition``, whatever the outcome, and every later call
    returns ``False``.
    """

    type = 'title'
    first = True

    def condition(self, block):
        """Return ``True`` only for the first block, if it passes the heading test."""
        if not self.first:
            return False
        self.first = False
        return HeadingRule.condition(self, block)
class ListItemRule(Rule):
    """Rule for blocks of type ``'listitem'``: blocks that begin with a hyphen."""

    type = 'listitem'

    def condition(self, block):
        """Return ``True`` if *block* starts with ``'-'``."""
        # startswith() is also safe on an empty block, unlike block[0].
        return block.startswith('-')

    def action(self, block, handler):
        """Emit the block without its leading hyphen, wrapped in list-item markup.

        Returns:
            ``True``.
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True
class ListRule(ListItemRule):
    """Rule for blocks of type ``'list'``: opens and closes the list markup.

    The rule applies to every block and tracks in ``inside`` whether a list
    is currently open. The list is started at the first list item and ended
    at the first block after it that is not a list item.
    """

    type = 'list'
    inside = False

    def condition(self, block):
        """Return ``True`` for every block so that ``action`` always runs."""
        return True

    def action(self, block, handler):
        """Start or end the list when entering or leaving a run of list items.

        Returns:
            ``False``; the block itself is not emitted here.
        """
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False
class ParagraphRule(Rule):
    """Rule for blocks of type ``'paragraph'``; it accepts every block."""

    type = 'paragraph'

    def condition(self, block):
        """Return ``True`` for any block."""
        return True
class Code(Rule):
    """Placeholder for code-box and syntax-highlighting rules.

    Nothing is implemented: the class defines neither ``type`` nor
    ``condition``.
    """

    pass
# Parse the whole text
class Parser:
    """Base class for parsers.

    A parser holds an ordered list of rules and a list of filters. ``parse``
    splits the input into blocks, runs every filter over each block and then
    applies the rules to it in order.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule* to the list of rules."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a filter that rewrites matches of *pattern*.

        Args:
            pattern: Regular expression to search for in each block.
            name: Name passed to ``handler.sub`` to obtain the replacement
                function.
        """
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Parse *file* and report the result through the handler.

        For each block, the rules whose ``condition`` accepts it have their
        ``action`` called in order; processing of the block stops at the
        first action that returns a true value.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    if last:
                        break
        self.handler.end('document')
class BasicTextParser(Parser):
    """Plain-text parser that installs the standard rules and inline filters."""

    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: Parser.parse stops at the first rule whose action
        # returns a true value, so the catch-all paragraph rule comes last.
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())
        self.addFilter(r'\*(.+?)\*', 'emphasis')
        # The URL pattern must be anchored on the scheme; without it every
        # run of letters (including tags added by the emphasis filter) would
        # be turned into a link.
        self.addFilter(r'(https?://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


# Run the test program.
# Build an HTML renderer and a parser around it, then parse standard input.
# The handler's methods print, so the result goes to standard output.
handler = HTMLRenderer()
parser = BasicTextParser(handler)
parser.parse(sys.stdin)
三、利用Python3将文本转化成pdf文件
命令> python md2pdf.py 源文件 目标文件 [options]
Options:
-h --help show help document.
-v --version show version information.
-o --output translate sourcefile into html file.
-p --print translate sourcefile into pdf file and html file respectively.
-P --Print translate sourcefile into pdf file only.
import os,re
import sys,getopt
from enum import Enum
from subprocess import call
from functools import reduce
from docopt import docopt
__version__ = '1.0'
# 定义三个枚举类
# 定义表状态
class TABLE(Enum):
    """States of the table parser."""

    Init = 1    # not inside a table
    Format = 2  # a candidate header row was seen; waiting for the separator row
    Table = 3   # inside the table body
# 有序序列状态
class ORDERLIST(Enum):
    """States of the ordered-list parser."""

    Init = 1  # not inside an ordered list
    List = 2  # inside an ordered list
# 块状态
class BLOCK(Enum):
    """States of the block parser."""

    Init = 1       # not inside a block
    Block = 2      # inside a block
    CodeBlock = 3  # defined, but not referenced in the visible part of the file
# Define the global parser state and initialise it.
# table_state, orderList_state, block_state, is_code and the two
# temp_table_first_line* values are read and updated by test_state().
table_state = TABLE.Init
orderList_state = ORDERLIST.Init
block_state = BLOCK.Init
is_code = False
# NOTE(review): presumably marks a line that needs no special handling; its
# readers are not visible in this part of the file -- confirm.
is_normal = True
# First row of a candidate table, kept as a list of cells and as raw text
# until the following line shows whether it really is a table header.
temp_table_first_line = []
temp_table_first_line_str = ""
# NOTE(review): not referenced in the visible part of the file; presumably
# set when math markup is found -- confirm.
need_mathjax = False
# Patterns are compiled once at import time instead of on every call.
_FENCE_RE = re.compile(r'```(\s)*\n')
_BLOCK_WHITESPACE_RE = re.compile(r'[\n\r\v\f\ ]')
_TAB_RE = re.compile(r'\t')
_TABLE_ROW_RE = re.compile(r'^((.+)\|)+((.+))$')


def test_state(input):
    """Update the global parser state for one line and return its HTML.

    Three independent state machines are advanced, in this order:

    * fenced blocks (``block_state`` / ``is_code``): a bare fence opens a
      ``<blockquote>``, a fence followed by ``python``, ``c++`` or ``c``
      opens a ``<code>`` block, and a bare fence closes whichever is open;
    * ordered lists (``orderList_state``): lines of the form ``N.text``
      with a single leading digit;
    * tables (``table_state``): lines containing ``|``. The first such line
      is buffered until the next line shows whether it is a header row.

    Args:
        input: One line of Markdown text, normally ending with a newline.

    Returns:
        The HTML produced for the line, or the line unchanged when no state
        machine applies. An empty string is returned while a candidate
        table header is being buffered.
    """
    global table_state, orderList_state, block_state, is_code, is_normal
    global temp_table_first_line, temp_table_first_line_str

    result = input

    # ---- Fenced blocks ----
    if _FENCE_RE.match(input) and block_state == BLOCK.Init:
        # Plain block.
        result = "<blockquote>"
        block_state = BLOCK.Block
        is_normal = False
    elif (block_state == BLOCK.Init and len(input) > 4
          and input.startswith('```')
          and input[3:].startswith(('python', 'c++', 'c'))):
        # Code block tagged with a language.
        block_state = BLOCK.Block
        result = "<code></br>"
        is_code = True
        is_normal = False
    elif block_state == BLOCK.Block and input == '```\n':
        # End of the block.
        result = "</code>" if is_code else "</blockquote>"
        block_state = BLOCK.Init
        is_code = False
        is_normal = False
    elif block_state == BLOCK.Block:
        # Line inside a block. HTML collapses runs of ordinary whitespace,
        # so non-breaking spaces are used to preserve the layout.
        result = _BLOCK_WHITESPACE_RE.sub("&nbsp;", result)
        result = _TAB_RE.sub("&nbsp;" * 4, result)
        result = "<span>" + result + "</span></br>"
        is_normal = False

    # ---- Ordered lists ----
    is_list_item = len(input) > 2 and input[0].isdigit() and input[1] == '.'
    if is_list_item and orderList_state == ORDERLIST.Init:
        orderList_state = ORDERLIST.List
        result = "<ol><li>" + input[2:] + "</li>"
        is_normal = False
    elif is_list_item and orderList_state == ORDERLIST.List:
        result = "<li>" + input[2:] + "</li>"
        is_normal = False
    elif orderList_state == ORDERLIST.List:
        # First line that is not a list item: close the list.
        result = "</ol>" + input
        orderList_state = ORDERLIST.Init

    # ---- Tables ----
    if _TABLE_ROW_RE.match(input):
        cells = input.split('|')
        # Drop the line terminator from the last cell.
        cells[-1] = cells[-1].rstrip('\n')
        # Remove the empty cells produced by a leading or trailing '|'.
        if cells[0] == '':
            cells.pop(0)
        if cells[-1] == '':
            cells.pop(-1)

        if table_state == TABLE.Init:
            # Candidate header row: buffer it and emit nothing yet.
            table_state = TABLE.Format
            temp_table_first_line = cells
            temp_table_first_line_str = input
            result = ""
        elif table_state == TABLE.Format:
            if all(all_same(cell, '-') for cell in cells):
                # Separator row between header and body: emit the header.
                table_state = TABLE.Table
                header = "".join(
                    "<th>" + cell + "</th>" for cell in temp_table_first_line
                )
                result = "<table><thead><tr>" + header + "</tr></thead><tbody>"
                is_normal = False
            else:
                # Not a table after all: emit the buffered line and this one.
                result = temp_table_first_line_str + "</br>" + input
                table_state = TABLE.Init
        elif table_state == TABLE.Table:
            result = "<tr>" + "".join(
                "<td>" + cell + "</td>" for cell in cells
            ) + "</tr>"
    elif table_state == TABLE.Table:
        # First line that is not a row: close the table.
        table_state = TABLE.Init
        result = "</tbody></table>" + result
    elif table_state == TABLE.Format:
        # NOTE(review): the buffered candidate header is neither emitted nor
        # discarded here, and the state stays Format -- confirm this is
        # intended.
        pass

    return result
# 判断 lst 是否全由字符 sym 构成
def all_same(lst, sym):
    """Return ``True`` if *lst* is empty or consists only of repetitions of *sym*.

    Args:
        lst: The string to check.
        sym: The character every position of *lst* must equal.
    """
    return not lst or sym * len(lst) == lst
# 处理标题
def handleTitle(s, n):
    """Wrap a heading line in an ``<hN>`` element.

    Args:
        s: The heading line. Its first *n* characters are dropped
            (presumably the leading ``#`` markers -- verify against the
            caller); the rest is kept as is, including any leading space.
        n: Heading level, used both as the tag number and as the number of
            characters to drop.

    Returns:
        ``"<hN>" + s[n:] + "</hN>"``.
    """
    return "<h{0}>{1}</h{0}>".format(n, s[n:])
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论