#!/usr/bin/env python3

# This script transforms markdown files into html and (optionally) nroff. The
# output files are written into the current directory named for the input file
# without the .md suffix and either the .html suffix or no suffix.
#
# If the input .md file has a section number at the end of the name (e.g.,
# rsync.1.md) a nroff file is also output (PROJ.NUM.md -> PROJ.NUM).
#
# The markdown input format has one extra extension: if a numbered list starts
# at 0, it is turned into a description list. The dl's dt tag is taken from the
# contents of the first tag inside the li, which is usually a p, code, or
# strong tag.
#
# The cmarkgfm or commonmark lib is used to transform the input file into
# html. Then, the html.parser is used as a state machine that lets us tweak
# the html and (optionally) output nroff data based on the html tags.
#
# If the string @USE_GFM_PARSER@ exists in the file, the string is removed and
# a github-flavored-markup parser is used to parse the file.
#
# The man-page .md files also get the vars @VERSION@, @BINDIR@, and @LIBDIR@
# substituted. Some of these values depend on the Makefile $(prefix) (see the
# generated Makefile). If the maintainer wants to build files for /usr/local
# while creating release-ready man-page files for /usr, use the environment to
# set RSYNC_OVERRIDE_PREFIX=/usr.
#
# This program is freely redistributable.

import argparse
import os
import re
import subprocess
import sys
import time
from html.parser import HTMLParser

# Tags whose closing tag consumes (and resets) the accumulated text buffer.
CONSUMES_TXT = set('h1 h2 p li pre'.split())

# Opening of every generated html page; the single %s is the page title.
HTML_START = """\
<html><head>
<title>%s</title>
<link href="https://fonts.googleapis.com/css2?family=Roboto&family=Roboto+Mono&display=swap" rel="stylesheet">
<style>
body {
max-width: 50em;
margin: auto;
}
body, b, strong, u {
font-family: 'Roboto', sans-serif;
}
code {
font-family: 'Roboto Mono', monospace;
font-weight: bold;
white-space: pre;
}
pre code {
display: block;
font-weight: normal;
}
blockquote pre code {
background: #f1f1f1;
}
dd p:first-of-type {
margin-block-start: 0em;
}
</style>
</head><body>
"""

# Extra CSS spliced in front of "</style>" when the html contains a table.
TABLE_STYLE = """\
table {
border-color: grey;
border-spacing: 0;
}
tr {
border-top: 1px solid grey;
}
tr:nth-child(2n) {
background-color: #f6f8fa;
}
th, td {
border: 1px solid #dfe2e5;
text-align: center;
padding-left: 1em;
padding-right: 1em;
}
"""

# Footer added to man-page html output; the %s is the date string.
MAN_HTML_END = """\
<div style="float: right"><p><i>%s</i></p></div>
"""

HTML_END = """\
</body></html>
"""

# nroff preamble; filled with (prog, section, date, version text, prefix).
MAN_START = r"""
.TH "%s" "%s" "%s" "%s" "User Commands"
.\" prefix=%s
""".lstrip()

MAN_END = """\
"""

# Each pair is (placeholder char used in the text buffer, nroff replacement).
# The placeholders let manify() escape backslashes in the real text before the
# nroff escapes are put in.
NORM_FONT = ('\1', r"\fP")
BOLD_FONT = ('\2', r"\fB")
UNDR_FONT = ('\3', r"\fI")
NBR_DASH = ('\4', r"\-")
NBR_SPACE = ('\xa0', r"\ ")

# The markdown -> html function; chosen at startup from the available libs.
md_parser = None
# Substitution values for man pages; filled by find_man_substitutions().
env_subs = { }

111

def main():
    """Convert every .md file named on the command line.

    Reads the module-level ``args`` namespace set up by the script entry
    point. With --test, a success line is printed after all files parsed.
    """
    for mdfn in args.mdfiles:
        parse_md_file(mdfn)

    if args.test:
        print("The test was successful.")

118

119

def parse_md_file(mdfn):
    """Convert one markdown file into NAME.html and, for man pages, NAME.

    mdfn must end in ".md"; when the name before that suffix ends in
    ".DIGITS" (e.g. rsync.1.md) a nroff man page is produced as well. The
    outputs are written to the current directory unless --test was given.
    Exits via die() if the name cannot be parsed or the file asks for the
    gfm parser and none is available.
    """
    fi = re.match(r'^(?P<fn>(?P<srcdir>.+/)?(?P<name>(?P<prog>[^/]+?)(\.(?P<sect>\d+))?)\.md)$', mdfn)
    if not fi:
        die('Failed to parse a md input file name:', mdfn)
    fi = argparse.Namespace(**fi.groupdict())
    fi.want_manpage = bool(fi.sect)
    if fi.want_manpage:
        fi.title = fi.prog + '(' + fi.sect + ') man page'
    else:
        fi.title = fi.prog

    if fi.want_manpage:
        # The substitutions are computed once and shared by all input files.
        if not env_subs:
            find_man_substitutions()
        prog_ver = 'rsync ' + env_subs['VERSION']
        if fi.prog != 'rsync':
            prog_ver = fi.prog + ' from ' + prog_ver
        fi.man_headings = (fi.prog, fi.sect, env_subs['date'], prog_ver, env_subs['prefix'])

    with open(mdfn, 'r', encoding='utf-8') as fh:
        txt = fh.read()

    use_gfm_parser = '@USE_GFM_PARSER@' in txt
    if use_gfm_parser:
        txt = txt.replace('@USE_GFM_PARSER@', '')

    if fi.want_manpage:
        txt = (txt.replace('@VERSION@', env_subs['VERSION'])
                  .replace('@BINDIR@', env_subs['bindir'])
                  .replace('@LIBDIR@', env_subs['libdir']))

    if use_gfm_parser:
        if not gfm_parser:
            die('Input file requires cmarkgfm parser:', mdfn)
        fi.html_in = gfm_parser(txt)
    else:
        fi.html_in = md_parser(txt)
    txt = None

    # The constructor does all the work, leaving fi.html_out and fi.man_out.
    TransformHtml(fi)

    if args.test:
        return

    output_list = [ (fi.name + '.html', fi.html_out) ]
    if fi.want_manpage:
        output_list.append((fi.name, fi.man_out))
    for fn, out_txt in output_list:
        # Unlink first so that a symlink or hard-linked file is replaced
        # rather than written through.
        if os.path.lexists(fn):
            os.unlink(fn)
        with open(fn, 'w', encoding='utf-8') as fh:
            fh.write(out_txt)
        # Only report the file once it has actually been written.
        print("Wrote:", fn)

173

174

def find_man_substitutions():
    """Fill env_subs with the values substituted into man pages.

    Sets 'prefix' (from RSYNC_OVERRIDE_PREFIX, else the Makefile), 'VERSION'
    (first double-quoted string in version.h), every NAME=VALUE variable read
    from ./Makefile up to and including 'srcdir', and 'date'. The date comes
    from the last git commit time when a .git exists next to this script,
    otherwise from the mtime of version.h. With --test, fixed values are used
    and no files are read. Exits via die() on missing or unparsable input.
    """
    # The sources live beside this script. An empty dirname means the script
    # was run from its own directory, so use relative names (not "/").
    srcdir = os.path.dirname(sys.argv[0])
    srcdir = srcdir + '/' if srcdir else ''
    mtime = 0

    git_dir = srcdir + '.git'
    if os.path.lexists(git_dir):
        mtime = int(subprocess.check_output(['git', '--git-dir', git_dir, 'log', '-1', '--format=%at']))

    # Allow "prefix" to be overridden via the environment:
    env_subs['prefix'] = os.environ.get('RSYNC_OVERRIDE_PREFIX', None)

    if args.test:
        env_subs['VERSION'] = '1.0.0'
        env_subs['bindir'] = '/usr/bin'
        env_subs['libdir'] = '/usr/lib/rsync'
    else:
        version_h = srcdir + 'version.h'
        for fn in (version_h, 'Makefile'):
            try:
                st = os.lstat(fn)
            except OSError:
                die('Failed to find', fn)
            if not mtime:
                mtime = st.st_mtime

        with open(version_h, 'r', encoding='utf-8') as fh:
            txt = fh.read()
        ver_match = re.search(r'"(.+?)"', txt)
        if not ver_match:
            die('Failed to find a quoted version string in', version_h)
        env_subs['VERSION'] = ver_match.group(1)

        var_ref_re = re.compile(r'\$\{(\w+)\}')

        def expand_var(ref):
            # Replace one ${name} with the value of an earlier assignment.
            value = env_subs.get(ref.group(1))
            if value is None:
                die('Makefile variable used before it is set:', ref.group(1))
            return value

        with open('Makefile', 'r', encoding='utf-8') as fh:
            for line in fh:
                m = re.match(r'^(\w+)=(.+)', line)
                if not m:
                    continue
                var, val = m.group(1), m.group(2)
                # Keep an environment-supplied prefix over the Makefile's.
                if var == 'prefix' and env_subs[var] is not None:
                    continue
                # Loop since a substituted value may contain more references.
                # Testing the full pattern (not just "${") means a malformed
                # reference is left alone instead of looping forever.
                while var_ref_re.search(val):
                    val = var_ref_re.sub(expand_var, val)
                env_subs[var] = val
                if var == 'srcdir':
                    break

    env_subs['date'] = time.strftime('%d %b %Y', time.localtime(mtime))

219

220

def html_via_commonmark(txt):
    """Return txt (markdown) rendered to html using the commonmark lib."""
    return commonmark.HtmlRenderer().render(commonmark.Parser().parse(txt))

223

224

class TransformHtml(HTMLParser):
    """Walk the html from the markdown lib, emitting tweaked html and nroff.

    All the work happens in the constructor: fi.html_in is fed through the
    parser and the results are stored as fi.html_out and fi.man_out (the
    latter holds only MAN_END's text unless fi.want_manpage is set, but the
    nroff text is accumulated either way).
    """

    def __init__(self, fi):
        """Transform fi.html_in, leaving fi.html_out and fi.man_out set.

        fi needs the attributes title, want_manpage, html_in, and (for a man
        page) man_headings. fi.html_in is set to None when done.
        """
        HTMLParser.__init__(self, convert_charrefs=True)

        st = self.state = argparse.Namespace(
            list_state = [ ],  # One item per open list: 'o', 'dl', or next number.
            p_macro = ".P\n",  # The nroff paragraph macro currently in effect.
            at_first_tag_in_li = False,
            at_first_tag_in_dd = False,
            dt_from = None,  # The tag whose contents become the current dt.
            in_pre = False,
            in_code = False,
            html_out = [ HTML_START % fi.title ],
            man_out = [ ],
            txt = '',  # Text gathered since the last consuming end tag.
            want_manpage = fi.want_manpage,
        )

        if st.want_manpage:
            st.man_out.append(MAN_START % fi.man_headings)

        if '</table>' in fi.html_in:
            st.html_out[0] = st.html_out[0].replace('</style>', TABLE_STYLE + '</style>')

        self.feed(fi.html_in)
        fi.html_in = None

        if st.want_manpage:
            st.html_out.append(MAN_HTML_END % env_subs['date'])
        st.html_out.append(HTML_END)
        st.man_out.append(MAN_END)

        fi.html_out = ''.join(st.html_out)
        st.html_out = None

        fi.man_out = ''.join(st.man_out)
        st.man_out = None

    def handle_starttag(self, tag, attrs_list):
        """Emit the html start tag (possibly renamed) and any nroff macro."""
        st = self.state
        if args.debug:
            self.output_debug('START', (tag, attrs_list))
        if st.at_first_tag_in_li:
            if st.list_state[-1] == 'dl':
                # The first tag in a description-list item supplies the dt.
                st.dt_from = tag
                if tag == 'p':
                    tag = 'dt'
                else:
                    st.html_out.append('<dt>')
            elif tag == 'p':
                st.at_first_tag_in_dd = True # Kluge to suppress a .P at the start of an li.
            st.at_first_tag_in_li = False
        if tag == 'p':
            if not st.at_first_tag_in_dd:
                st.man_out.append(st.p_macro)
        elif tag == 'li':
            st.at_first_tag_in_li = True
            lstate = st.list_state[-1]
            if lstate == 'dl':
                # No html is output yet; the first inner tag starts the dt.
                return
            if lstate == 'o':
                st.man_out.append(".IP o\n")
            else:
                st.man_out.append(".IP " + str(lstate) + ".\n")
                st.list_state[-1] += 1
        elif tag == 'blockquote':
            st.man_out.append(".RS 4\n")
        elif tag == 'pre':
            st.in_pre = True
            st.man_out.append(st.p_macro + ".nf\n")
        elif tag == 'code' and not st.in_pre:
            st.in_code = True
            st.txt += BOLD_FONT[0]
        elif tag == 'strong' or tag == 'b':
            st.txt += BOLD_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the man page
            st.txt += UNDR_FONT[0]
        elif tag == 'ol':
            start = 1
            for var, val in attrs_list:
                if var == 'start':
                    start = int(val) # We only support integers.
                    break
            if st.list_state:
                st.man_out.append(".RS\n")
            if start == 0:
                # A numbered list that starts at 0 becomes a description list.
                tag = 'dl'
                attrs_list = [ ]
                st.list_state.append('dl')
            else:
                st.list_state.append(start)
            st.man_out.append(st.p_macro)
            st.p_macro = ".IP\n"
        elif tag == 'ul':
            st.man_out.append(st.p_macro)
            # NOTE(review): nesting of the p_macro assignment was reconstructed
            # from de-indented source; confirm against the upstream file.
            if st.list_state:
                st.man_out.append(".RS\n")
                st.p_macro = ".IP\n"
            st.list_state.append('o')
        elif tag == 'hr':
            st.man_out.append(".l\n")
            st.html_out.append("<hr />")
            return
        st.html_out.append('<' + tag + ''.join(' ' + var + '="' + htmlify(val) + '"' for var, val in attrs_list) + '>')
        st.at_first_tag_in_dd = False

    def handle_endtag(self, tag):
        """Emit the html end tag (possibly renamed) and flush nroff text."""
        st = self.state
        if args.debug:
            self.output_debug('END', (tag,))
        if tag in CONSUMES_TXT or st.dt_from == tag:
            txt = st.txt.strip()
            st.txt = ''
        else:
            txt = None
        add_to_txt = None
        if tag == 'h1':
            st.man_out.append(st.p_macro + '.SH "' + manify(txt) + '"\n')
        elif tag == 'h2':
            st.man_out.append(st.p_macro + '.SS "' + manify(txt) + '"\n')
        elif tag == 'p':
            if st.dt_from == 'p':
                tag = 'dt'
                st.man_out.append('.IP "' + manify(txt) + '"\n')
                st.dt_from = None
            elif txt != '':
                st.man_out.append(manify(txt) + "\n")
        elif tag == 'li':
            if st.list_state[-1] == 'dl':
                if st.at_first_tag_in_li:
                    die("Invalid 0. -> td translation")
                tag = 'dd'
            if txt != '':
                st.man_out.append(manify(txt) + "\n")
            st.at_first_tag_in_li = False
        elif tag == 'blockquote':
            st.man_out.append(".RE\n")
        elif tag == 'pre':
            st.in_pre = False
            st.man_out.append(manify(txt) + "\n.fi\n")
        elif tag == 'code' and not st.in_pre:
            st.in_code = False
            add_to_txt = NORM_FONT[0]
        elif tag == 'strong' or tag == 'b':
            add_to_txt = NORM_FONT[0]
        elif tag == 'em' or tag == 'i':
            if st.want_manpage:
                tag = 'u' # Change it into underline to be more like the man page
            add_to_txt = NORM_FONT[0]
        elif tag == 'ol' or tag == 'ul':
            if st.list_state.pop() == 'dl':
                tag = 'dl'
            if st.list_state:
                st.man_out.append(".RE\n")
            else:
                st.p_macro = ".P\n"
            st.at_first_tag_in_dd = False
        elif tag == 'hr':
            return
        st.html_out.append('</' + tag + '>')
        if add_to_txt:
            # The font reset goes into whichever text is still pending: the
            # shared buffer, or the text just taken from it for a dt.
            if txt is None:
                st.txt += add_to_txt
            else:
                txt += add_to_txt
        if st.dt_from == tag:
            st.man_out.append('.IP "' + manify(txt) + '"\n')
            st.html_out.append('</dt><dd>')
            st.at_first_tag_in_dd = True
            st.dt_from = None
        elif tag == 'dt':
            st.html_out.append('<dd>')
            st.at_first_tag_in_dd = True

    def handle_data(self, txt):
        """Append text to the html output and to the pending nroff text."""
        st = self.state
        if args.debug:
            self.output_debug('DATA', (txt,))
        if st.in_pre:
            html = htmlify(txt)
        else:
            # Mark dashes (and the space before a lone "--") as non-breaking.
            txt = re.sub(r'\s--(\s)', NBR_SPACE[0] + r'--\1', txt).replace('--', NBR_DASH[0]*2)
            txt = re.sub(r'(^|\W)-', r'\1' + NBR_DASH[0], txt)
            html = htmlify(txt)
            if st.in_code:
                txt = re.sub(r'\s', NBR_SPACE[0], txt)
                html = html.replace(NBR_DASH[0], '-').replace(NBR_SPACE[0], ' ') # <code> is non-breaking in CSS
        # Written as entities so the invisible characters survive in the file:
        # a no-break space, and a word joiner after each dash.
        st.html_out.append(html.replace(NBR_SPACE[0], '&nbsp;').replace(NBR_DASH[0], '-&#8288;'))
        st.txt += txt

    def output_debug(self, event, extra):
        """Print the event and the parser state (abbreviated if debug < 2)."""
        import pprint
        st = self.state
        if args.debug < 2:
            # Work on a shallow copy so that trimming the lists for display
            # does not disturb the real state.
            st = argparse.Namespace(**vars(st))
            if len(st.html_out) > 2:
                st.html_out = ['...'] + st.html_out[-2:]
            if len(st.man_out) > 2:
                st.man_out = ['...'] + st.man_out[-2:]
        print(event, extra)
        pprint.PrettyPrinter(indent=2).pprint(vars(st))

432

433

def manify(txt):
    """Return txt converted for nroff output.

    Backslashes are doubled first, then the placeholder characters are
    replaced by their nroff escapes. Finally any line starting with "'" or
    "." gets a leading "\\&" so nroff does not treat it as a request.
    """
    return re.sub(r"^(['.])", r'\&\1', txt.replace('\\', '\\\\')
            .replace(NBR_SPACE[0], NBR_SPACE[1])
            .replace(NBR_DASH[0], NBR_DASH[1])
            .replace(NORM_FONT[0], NORM_FONT[1])
            .replace(BOLD_FONT[0], BOLD_FONT[1])
            .replace(UNDR_FONT[0], UNDR_FONT[1]), flags=re.M)

441

442

def htmlify(txt):
    """Return txt with &, <, >, and " replaced by their html entities."""
    # The ampersand must be handled first so that the entities added by the
    # later replacements are not escaped a second time.
    return txt.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

445

446

def warn(*msg):
    """Print the given values to stderr, space-separated like print()."""
    print(*msg, file=sys.stderr)

449

450

def die(*msg):
    """Print the given values to stderr and exit with status 1."""
    warn(*msg)
    sys.exit(1)

454

455

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Output html and (optionally) nroff for markdown pages.", add_help=False)
    parser.add_argument('--test', action='store_true', help="Just test the parsing without outputting any files.")
    parser.add_argument('--debug', '-D', action='count', default=0, help='Output copious info on the html parsing. Repeat for even more.')
    parser.add_argument("--help", "-h", action="help", help="Output this help message and exit.")
    parser.add_argument("mdfiles", nargs='+', help="The source .md files to convert.")
    args = parser.parse_args()

    # Prefer cmarkgfm (which also supplies the gfm parser) and fall back to
    # commonmark. Any ordinary failure to set up cmarkgfm triggers the
    # fallback, but KeyboardInterrupt and SystemExit are no longer swallowed.
    try:
        import cmarkgfm
        md_parser = cmarkgfm.markdown_to_html
        gfm_parser = cmarkgfm.github_flavored_markdown_to_html
    except Exception:
        try:
            import commonmark
            md_parser = html_via_commonmark
        except ImportError:
            die("Failed to find cmarkgfm or commonmark for python3.")
        gfm_parser = None

    main()