all repos — cgit @ 6d6f8bdeed0b47422337c430fdaf236b978f6c44

a hyperfast web frontend for git written in c

filters/html-converters/resources/markdown.pl (view raw)

   1#!/usr/bin/perl
   2
   3#
   4# Markdown -- A text-to-HTML conversion tool for web writers
   5#
   6# Copyright (c) 2004 John Gruber
   7# <http://daringfireball.net/projects/markdown/>
   8#
   9
  10
  11package Markdown;
  12require 5.006_000;
  13use strict;
  14use warnings;
  15
  16use Digest::MD5 qw(md5_hex);
  17use vars qw($VERSION);
  18$VERSION = '1.0.1';
  19# Tue 14 Dec 2004
  20
  21
  22#
  23# Global default settings:
  24#
  25my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
  26my $g_tab_width = 4;
  27
  28
  29#
  30# Globals:
  31#
  32
  33# Regex to match balanced [brackets]. See Friedl's
  34# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  35my $g_nested_brackets;
  36$g_nested_brackets = qr{
  37	(?> 								# Atomic matching
  38	   [^\[\]]+							# Anything other than brackets
  39	 | 
  40	   \[
  41		 (??{ $g_nested_brackets })		# Recursive set of nested brackets
  42	   \]
  43	)*
  44}x;
  45
  46
  47# Table of hash values for escaped characters:
  48my %g_escape_table;
  49foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
  50	$g_escape_table{$char} = md5_hex($char);
  51}
  52
  53
  54# Global hashes, used by various utility routines
  55my %g_urls;
  56my %g_titles;
  57my %g_html_blocks;
  58
  59# Used to track when we're inside an ordered or unordered list
  60# (see _ProcessListItems() for details):
  61my $g_list_level = 0;
  62
  63
  64#### Blosxom plug-in interface ##########################################
  65
  66# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
  67# which posts Markdown should process, using a "meta-markup: markdown"
  68# header. If it's set to 0 (the default), Markdown will process all
  69# entries.
  70my $g_blosxom_use_meta = 0;
  71
  72sub start { 1; }
  73sub story {
  74	my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
  75
  76	if ( (! $g_blosxom_use_meta) or
  77	     (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
  78	     ){
  79			$$body_ref  = Markdown($$body_ref);
  80     }
  81     1;
  82}
  83
  84
  85#### Movable Type plug-in interface #####################################
  86eval {require MT};  # Test to see if we're running in MT.
  87unless ($@) {
  88    require MT;
  89    import  MT;
  90    require MT::Template::Context;
  91    import  MT::Template::Context;
  92
  93	eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
  94	unless ($@) {
  95		require MT::Plugin;
  96		import  MT::Plugin;
  97		my $plugin = new MT::Plugin({
  98			name => "Markdown",
  99			description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 100			doc_link => 'http://daringfireball.net/projects/markdown/'
 101		});
 102		MT->add_plugin( $plugin );
 103	}
 104
 105	MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 106		my $ctx	 = shift;
 107		my $args = shift;
 108		my $builder = $ctx->stash('builder');
 109		my $tokens = $ctx->stash('tokens');
 110
 111		if (defined ($args->{'output'}) ) {
 112			$ctx->stash('markdown_output', lc $args->{'output'});
 113		}
 114
 115		defined (my $str = $builder->build($ctx, $tokens) )
 116			or return $ctx->error($builder->errstr);
 117		$str;		# return value
 118	});
 119
 120	MT->add_text_filter('markdown' => {
 121		label     => 'Markdown',
 122		docs      => 'http://daringfireball.net/projects/markdown/',
 123		on_format => sub {
 124			my $text = shift;
 125			my $ctx  = shift;
 126			my $raw  = 0;
 127		    if (defined $ctx) {
 128		    	my $output = $ctx->stash('markdown_output'); 
 129				if (defined $output  &&  $output =~ m/^html/i) {
 130					$g_empty_element_suffix = ">";
 131					$ctx->stash('markdown_output', '');
 132				}
 133				elsif (defined $output  &&  $output eq 'raw') {
 134					$raw = 1;
 135					$ctx->stash('markdown_output', '');
 136				}
 137				else {
 138					$raw = 0;
 139					$g_empty_element_suffix = " />";
 140				}
 141			}
 142			$text = $raw ? $text : Markdown($text);
 143			$text;
 144		},
 145	});
 146
 147	# If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 148	my $smartypants;
 149
 150	{
 151		no warnings "once";
 152		$smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 153	}
 154
 155	if ($smartypants) {
 156		MT->add_text_filter('markdown_with_smartypants' => {
 157			label     => 'Markdown With SmartyPants',
 158			docs      => 'http://daringfireball.net/projects/markdown/',
 159			on_format => sub {
 160				my $text = shift;
 161				my $ctx  = shift;
 162				if (defined $ctx) {
 163					my $output = $ctx->stash('markdown_output'); 
 164					if (defined $output  &&  $output eq 'html') {
 165						$g_empty_element_suffix = ">";
 166					}
 167					else {
 168						$g_empty_element_suffix = " />";
 169					}
 170				}
 171				$text = Markdown($text);
 172				$text = $smartypants->($text, '1');
 173			},
 174		});
 175	}
 176}
 177else {
 178#### BBEdit/command-line text filter interface ##########################
 179# Needs to be hidden from MT (and Blosxom when running in static mode).
 180
 181    # We're only using $blosxom::version once; tell Perl not to warn us:
 182	no warnings 'once';
 183    unless ( defined($blosxom::version) ) {
 184		use warnings;
 185
 186		#### Check for command-line switches: #################
 187		my %cli_opts;
 188		use Getopt::Long;
 189		Getopt::Long::Configure('pass_through');
 190		GetOptions(\%cli_opts,
 191			'version',
 192			'shortversion',
 193			'html4tags',
 194		);
 195		if ($cli_opts{'version'}) {		# Version info
 196			print "\nThis is Markdown, version $VERSION.\n";
 197			print "Copyright 2004 John Gruber\n";
 198			print "http://daringfireball.net/projects/markdown/\n\n";
 199			exit 0;
 200		}
 201		if ($cli_opts{'shortversion'}) {		# Just the version number string.
 202			print $VERSION;
 203			exit 0;
 204		}
 205		if ($cli_opts{'html4tags'}) {			# Use HTML tag style instead of XHTML
 206			$g_empty_element_suffix = ">";
 207		}
 208
 209
 210		#### Process incoming text: ###########################
 211		my $text;
 212		{
 213			local $/;               # Slurp the whole file
 214			$text = <>;
 215		}
 216	print <<'EOT';
 217<style>
 218.markdown-body {
 219    font-size: 14px;
 220    line-height: 1.6;
 221    overflow: hidden;
 222}
 223.markdown-body>*:first-child {
 224    margin-top: 0 !important;
 225}
 226.markdown-body>*:last-child {
 227    margin-bottom: 0 !important;
 228}
 229.markdown-body a.absent {
 230    color: #c00;
 231}
 232.markdown-body a.anchor {
 233    display: block;
 234    padding-left: 30px;
 235    margin-left: -30px;
 236    cursor: pointer;
 237    position: absolute;
 238    top: 0;
 239    left: 0;
 240    bottom: 0;
 241}
 242.markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
 243    margin: 20px 0 10px;
 244    padding: 0;
 245    font-weight: bold;
 246    -webkit-font-smoothing: antialiased;
 247    cursor: text;
 248    position: relative;
 249}
 250.markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
 251    display: none;
 252    color: #000;
 253}
 254.markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
 255    text-decoration: none;
 256    line-height: 1;
 257    padding-left: 0;
 258    margin-left: -22px;
 259    top: 15%}
 260.markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
 261    display: inline-block;
 262}
 263.markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
 264    font-size: inherit;
 265}
 266.markdown-body h1 {
 267    font-size: 28px;
 268    color: #000;
 269}
 270.markdown-body h2 {
 271    font-size: 24px;
 272    border-bottom: 1px solid #ccc;
 273    color: #000;
 274}
 275.markdown-body h3 {
 276    font-size: 18px;
 277}
 278.markdown-body h4 {
 279    font-size: 16px;
 280}
 281.markdown-body h5 {
 282    font-size: 14px;
 283}
 284.markdown-body h6 {
 285    color: #777;
 286    font-size: 14px;
 287}
 288.markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
 289    margin: 15px 0;
 290}
 291.markdown-body hr {
 292    background: transparent url("/dirty-shade.png") repeat-x 0 0;
 293    border: 0 none;
 294    color: #ccc;
 295    height: 4px;
 296    padding: 0;
 297}
 298.markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
 299    margin-top: 0;
 300    padding-top: 0;
 301}
 302.markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
 303    margin-top: 0;
 304    padding-top: 0;
 305}
 306.markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
 307    margin-top: 0;
 308}
 309.markdown-body li p.first {
 310    display: inline-block;
 311}
 312.markdown-body ul, .markdown-body ol {
 313    padding-left: 30px;
 314}
 315.markdown-body ul.no-list, .markdown-body ol.no-list {
 316    list-style-type: none;
 317    padding: 0;
 318}
 319.markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
 320    margin-top: 0px;
 321}
 322.markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
 323    margin-bottom: 0;
 324}
 325.markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
 326    margin-bottom: 0;
 327}
 328.markdown-body dl {
 329    padding: 0;
 330}
 331.markdown-body dl dt {
 332    font-size: 14px;
 333    font-weight: bold;
 334    font-style: italic;
 335    padding: 0;
 336    margin: 15px 0 5px;
 337}
 338.markdown-body dl dt:first-child {
 339    padding: 0;
 340}
 341.markdown-body dl dt>:first-child {
 342    margin-top: 0px;
 343}
 344.markdown-body dl dt>:last-child {
 345    margin-bottom: 0px;
 346}
 347.markdown-body dl dd {
 348    margin: 0 0 15px;
 349    padding: 0 15px;
 350}
 351.markdown-body dl dd>:first-child {
 352    margin-top: 0px;
 353}
 354.markdown-body dl dd>:last-child {
 355    margin-bottom: 0px;
 356}
 357.markdown-body blockquote {
 358    border-left: 4px solid #DDD;
 359    padding: 0 15px;
 360    color: #777;
 361}
 362.markdown-body blockquote>:first-child {
 363    margin-top: 0px;
 364}
 365.markdown-body blockquote>:last-child {
 366    margin-bottom: 0px;
 367}
 368.markdown-body table th {
 369    font-weight: bold;
 370}
 371.markdown-body table th, .markdown-body table td {
 372    border: 1px solid #ccc;
 373    padding: 6px 13px;
 374}
 375.markdown-body table tr {
 376    border-top: 1px solid #ccc;
 377    background-color: #fff;
 378}
 379.markdown-body table tr:nth-child(2n) {
 380    background-color: #f8f8f8;
 381}
 382.markdown-body img {
 383    max-width: 100%;
 384    -moz-box-sizing: border-box;
 385    box-sizing: border-box;
 386}
 387.markdown-body span.frame {
 388    display: block;
 389    overflow: hidden;
 390}
 391.markdown-body span.frame>span {
 392    border: 1px solid #ddd;
 393    display: block;
 394    float: left;
 395    overflow: hidden;
 396    margin: 13px 0 0;
 397    padding: 7px;
 398    width: auto;
 399}
 400.markdown-body span.frame span img {
 401    display: block;
 402    float: left;
 403}
 404.markdown-body span.frame span span {
 405    clear: both;
 406    color: #333;
 407    display: block;
 408    padding: 5px 0 0;
 409}
 410.markdown-body span.align-center {
 411    display: block;
 412    overflow: hidden;
 413    clear: both;
 414}
 415.markdown-body span.align-center>span {
 416    display: block;
 417    overflow: hidden;
 418    margin: 13px auto 0;
 419    text-align: center;
 420}
 421.markdown-body span.align-center span img {
 422    margin: 0 auto;
 423    text-align: center;
 424}
 425.markdown-body span.align-right {
 426    display: block;
 427    overflow: hidden;
 428    clear: both;
 429}
 430.markdown-body span.align-right>span {
 431    display: block;
 432    overflow: hidden;
 433    margin: 13px 0 0;
 434    text-align: right;
 435}
 436.markdown-body span.align-right span img {
 437    margin: 0;
 438    text-align: right;
 439}
 440.markdown-body span.float-left {
 441    display: block;
 442    margin-right: 13px;
 443    overflow: hidden;
 444    float: left;
 445}
 446.markdown-body span.float-left span {
 447    margin: 13px 0 0;
 448}
 449.markdown-body span.float-right {
 450    display: block;
 451    margin-left: 13px;
 452    overflow: hidden;
 453    float: right;
 454}
 455.markdown-body span.float-right>span {
 456    display: block;
 457    overflow: hidden;
 458    margin: 13px auto 0;
 459    text-align: right;
 460}
 461.markdown-body code, .markdown-body tt {
 462    margin: 0 2px;
 463    padding: 0px 5px;
 464    border: 1px solid #eaeaea;
 465    background-color: #f8f8f8;
 466    border-radius: 3px;
 467}
 468.markdown-body code {
 469    white-space: nowrap;
 470}
 471.markdown-body pre>code {
 472    margin: 0;
 473    padding: 0;
 474    white-space: pre;
 475    border: none;
 476    background: transparent;
 477}
 478.markdown-body .highlight pre, .markdown-body pre {
 479    background-color: #f8f8f8;
 480    border: 1px solid #ccc;
 481    font-size: 13px;
 482    line-height: 19px;
 483    overflow: auto;
 484    padding: 6px 10px;
 485    border-radius: 3px;
 486}
 487.markdown-body pre code, .markdown-body pre tt {
 488    margin: 0;
 489    padding: 0;
 490    background-color: transparent;
 491    border: none;
 492}
 493</style>
 494EOT
 495	print "<div class='markdown-body'>";
 496        print Markdown($text);
 497	print "</div>";
 498    }
 499}
 500
 501
 502
 503sub Markdown {
 504#
 505# Main function. The order in which other subs are called here is
 506# essential. Link and image substitutions need to happen before
 507# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 508# and <img> tags get encoded.
 509#
 510	my $text = shift;
 511
 512	# Clear the global hashes. If we don't clear these, you get conflicts
 513	# from other articles when generating a page which contains more than
 514	# one article (e.g. an index page that shows the N most recent
 515	# articles):
 516	%g_urls = ();
 517	%g_titles = ();
 518	%g_html_blocks = ();
 519
 520
 521	# Standardize line endings:
 522	$text =~ s{\r\n}{\n}g; 	# DOS to Unix
 523	$text =~ s{\r}{\n}g; 	# Mac to Unix
 524
 525	# Make sure $text ends with a couple of newlines:
 526	$text .= "\n\n";
 527
 528	# Convert all tabs to spaces.
 529	$text = _Detab($text);
 530
 531	# Strip any lines consisting only of spaces and tabs.
 532	# This makes subsequent regexen easier to write, because we can
 533	# match consecutive blank lines with /\n+/ instead of something
 534	# contorted like /[ \t]*\n+/ .
 535	$text =~ s/^[ \t]+$//mg;
 536
 537	# Turn block-level HTML blocks into hash entries
 538	$text = _HashHTMLBlocks($text);
 539
 540	# Strip link definitions, store in hashes.
 541	$text = _StripLinkDefinitions($text);
 542
 543	$text = _RunBlockGamut($text);
 544
 545	$text = _UnescapeSpecialChars($text);
 546
 547	return $text . "\n";
 548}
 549
 550
 551sub _StripLinkDefinitions {
 552#
 553# Strips link definitions from text, stores the URLs and titles in
 554# hash references.
 555#
 556	my $text = shift;
 557	my $less_than_tab = $g_tab_width - 1;
 558
 559	# Link defs are in the form: ^[id]: url "optional title"
 560	while ($text =~ s{
 561						^[ ]{0,$less_than_tab}\[(.+)\]:	# id = $1
 562						  [ \t]*
 563						  \n?				# maybe *one* newline
 564						  [ \t]*
 565						<?(\S+?)>?			# url = $2
 566						  [ \t]*
 567						  \n?				# maybe one newline
 568						  [ \t]*
 569						(?:
 570							(?<=\s)			# lookbehind for whitespace
 571							["(]
 572							(.+?)			# title = $3
 573							[")]
 574							[ \t]*
 575						)?	# title is optional
 576						(?:\n+|\Z)
 577					}
 578					{}mx) {
 579		$g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );	# Link IDs are case-insensitive
 580		if ($3) {
 581			$g_titles{lc $1} = $3;
 582			$g_titles{lc $1} =~ s/"/&quot;/g;
 583		}
 584	}
 585
 586	return $text;
 587}
 588
 589
 590sub _HashHTMLBlocks {
 591	my $text = shift;
 592	my $less_than_tab = $g_tab_width - 1;
 593
 594	# Hashify HTML blocks:
 595	# We only want to do this for block-level HTML tags, such as headers,
 596	# lists, and tables. That's because we still want to wrap <p>s around
 597	# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 598	# phrase emphasis, and spans. The list of tags we're looking for is
 599	# hard-coded:
 600	my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
 601	my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
 602
 603	# First, look for nested blocks, e.g.:
 604	# 	<div>
 605	# 		<div>
 606	# 		tags for inner block must be indented.
 607	# 		</div>
 608	# 	</div>
 609	#
 610	# The outermost tags must start at the left margin for this to match, and
 611	# the inner nested divs must be indented.
 612	# We need to do this before the next, more liberal match, because the next
 613	# match will start at the first `<div>` and stop at the first `</div>`.
 614	$text =~ s{
 615				(						# save in $1
 616					^					# start of line  (with /m)
 617					<($block_tags_a)	# start tag = $2
 618					\b					# word break
 619					(.*\n)*?			# any number of lines, minimally matching
 620					</\2>				# the matching end tag
 621					[ \t]*				# trailing spaces/tabs
 622					(?=\n+|\Z)	# followed by a newline or end of document
 623				)
 624			}{
 625				my $key = md5_hex($1);
 626				$g_html_blocks{$key} = $1;
 627				"\n\n" . $key . "\n\n";
 628			}egmx;
 629
 630
 631	#
 632	# Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 633	#
 634	$text =~ s{
 635				(						# save in $1
 636					^					# start of line  (with /m)
 637					<($block_tags_b)	# start tag = $2
 638					\b					# word break
 639					(.*\n)*?			# any number of lines, minimally matching
 640					.*</\2>				# the matching end tag
 641					[ \t]*				# trailing spaces/tabs
 642					(?=\n+|\Z)	# followed by a newline or end of document
 643				)
 644			}{
 645				my $key = md5_hex($1);
 646				$g_html_blocks{$key} = $1;
 647				"\n\n" . $key . "\n\n";
 648			}egmx;
 649	# Special case just for <hr />. It was easier to make a special case than
 650	# to make the other regex more complicated.	
 651	$text =~ s{
 652				(?:
 653					(?<=\n\n)		# Starting after a blank line
 654					|				# or
 655					\A\n?			# the beginning of the doc
 656				)
 657				(						# save in $1
 658					[ ]{0,$less_than_tab}
 659					<(hr)				# start tag = $2
 660					\b					# word break
 661					([^<>])*?			# 
 662					/?>					# the matching end tag
 663					[ \t]*
 664					(?=\n{2,}|\Z)		# followed by a blank line or end of document
 665				)
 666			}{
 667				my $key = md5_hex($1);
 668				$g_html_blocks{$key} = $1;
 669				"\n\n" . $key . "\n\n";
 670			}egx;
 671
 672	# Special case for standalone HTML comments:
 673	$text =~ s{
 674				(?:
 675					(?<=\n\n)		# Starting after a blank line
 676					|				# or
 677					\A\n?			# the beginning of the doc
 678				)
 679				(						# save in $1
 680					[ ]{0,$less_than_tab}
 681					(?s:
 682						<!
 683						(--.*?--\s*)+
 684						>
 685					)
 686					[ \t]*
 687					(?=\n{2,}|\Z)		# followed by a blank line or end of document
 688				)
 689			}{
 690				my $key = md5_hex($1);
 691				$g_html_blocks{$key} = $1;
 692				"\n\n" . $key . "\n\n";
 693			}egx;
 694
 695
 696	return $text;
 697}
 698
 699
 700sub _RunBlockGamut {
 701#
 702# These are all the transformations that form block-level
 703# tags like paragraphs, headers, and list items.
 704#
 705	my $text = shift;
 706
 707	$text = _DoHeaders($text);
 708
 709	# Do Horizontal Rules:
 710	$text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 711	$text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 712	$text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 713
 714	$text = _DoLists($text);
 715
 716	$text = _DoCodeBlocks($text);
 717
 718	$text = _DoBlockQuotes($text);
 719
 720	# We already ran _HashHTMLBlocks() before, in Markdown(), but that
 721	# was to escape raw HTML in the original Markdown source. This time,
 722	# we're escaping the markup we've just created, so that we don't wrap
 723	# <p> tags around block-level tags.
 724	$text = _HashHTMLBlocks($text);
 725
 726	$text = _FormParagraphs($text);
 727
 728	return $text;
 729}
 730
 731
 732sub _RunSpanGamut {
 733#
 734# These are all the transformations that occur *within* block-level
 735# tags like paragraphs, headers, and list items.
 736#
 737	my $text = shift;
 738
 739	$text = _DoCodeSpans($text);
 740
 741	$text = _EscapeSpecialChars($text);
 742
 743	# Process anchor and image tags. Images must come first,
 744	# because ![foo][f] looks like an anchor.
 745	$text = _DoImages($text);
 746	$text = _DoAnchors($text);
 747
 748	# Make links out of things like `<http://example.com/>`
 749	# Must come after _DoAnchors(), because you can use < and >
 750	# delimiters in inline links like [this](<url>).
 751	$text = _DoAutoLinks($text);
 752
 753	$text = _EncodeAmpsAndAngles($text);
 754
 755	$text = _DoItalicsAndBold($text);
 756
 757	# Do hard breaks:
 758	$text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
 759
 760	return $text;
 761}
 762
 763
 764sub _EscapeSpecialChars {
 765	my $text = shift;
 766	my $tokens ||= _TokenizeHTML($text);
 767
 768	$text = '';   # rebuild $text from the tokens
 769# 	my $in_pre = 0;	 # Keep track of when we're inside <pre> or <code> tags.
 770# 	my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
 771
 772	foreach my $cur_token (@$tokens) {
 773		if ($cur_token->[0] eq "tag") {
 774			# Within tags, encode * and _ so they don't conflict
 775			# with their use in Markdown for italics and strong.
 776			# We're replacing each such character with its
 777			# corresponding MD5 checksum value; this is likely
 778			# overkill, but it should prevent us from colliding
 779			# with the escape values by accident.
 780			$cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
 781			$cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
 782			$text .= $cur_token->[1];
 783		} else {
 784			my $t = $cur_token->[1];
 785			$t = _EncodeBackslashEscapes($t);
 786			$text .= $t;
 787		}
 788	}
 789	return $text;
 790}
 791
 792
 793sub _DoAnchors {
 794#
 795# Turn Markdown link shortcuts into XHTML <a> tags.
 796#
 797	my $text = shift;
 798
 799	#
 800	# First, handle reference-style links: [link text] [id]
 801	#
 802	$text =~ s{
 803		(					# wrap whole match in $1
 804		  \[
 805		    ($g_nested_brackets)	# link text = $2
 806		  \]
 807
 808		  [ ]?				# one optional space
 809		  (?:\n[ ]*)?		# one optional newline followed by spaces
 810
 811		  \[
 812		    (.*?)		# id = $3
 813		  \]
 814		)
 815	}{
 816		my $result;
 817		my $whole_match = $1;
 818		my $link_text   = $2;
 819		my $link_id     = lc $3;
 820
 821		if ($link_id eq "") {
 822			$link_id = lc $link_text;     # for shortcut links like [this][].
 823		}
 824
 825		if (defined $g_urls{$link_id}) {
 826			my $url = $g_urls{$link_id};
 827			$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
 828			$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
 829			$result = "<a href=\"$url\"";
 830			if ( defined $g_titles{$link_id} ) {
 831				my $title = $g_titles{$link_id};
 832				$title =~ s! \* !$g_escape_table{'*'}!gx;
 833				$title =~ s!  _ !$g_escape_table{'_'}!gx;
 834				$result .=  " title=\"$title\"";
 835			}
 836			$result .= ">$link_text</a>";
 837		}
 838		else {
 839			$result = $whole_match;
 840		}
 841		$result;
 842	}xsge;
 843
 844	#
 845	# Next, inline-style links: [link text](url "optional title")
 846	#
 847	$text =~ s{
 848		(				# wrap whole match in $1
 849		  \[
 850		    ($g_nested_brackets)	# link text = $2
 851		  \]
 852		  \(			# literal paren
 853		  	[ \t]*
 854			<?(.*?)>?	# href = $3
 855		  	[ \t]*
 856			(			# $4
 857			  (['"])	# quote char = $5
 858			  (.*?)		# Title = $6
 859			  \5		# matching quote
 860			)?			# title is optional
 861		  \)
 862		)
 863	}{
 864		my $result;
 865		my $whole_match = $1;
 866		my $link_text   = $2;
 867		my $url	  		= $3;
 868		my $title		= $6;
 869
 870		$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
 871		$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
 872		$result = "<a href=\"$url\"";
 873
 874		if (defined $title) {
 875			$title =~ s/"/&quot;/g;
 876			$title =~ s! \* !$g_escape_table{'*'}!gx;
 877			$title =~ s!  _ !$g_escape_table{'_'}!gx;
 878			$result .=  " title=\"$title\"";
 879		}
 880
 881		$result .= ">$link_text</a>";
 882
 883		$result;
 884	}xsge;
 885
 886	return $text;
 887}
 888
 889
 890sub _DoImages {
 891#
 892# Turn Markdown image shortcuts into <img> tags.
 893#
 894	my $text = shift;
 895
 896	#
 897	# First, handle reference-style labeled images: ![alt text][id]
 898	#
 899	$text =~ s{
 900		(				# wrap whole match in $1
 901		  !\[
 902		    (.*?)		# alt text = $2
 903		  \]
 904
 905		  [ ]?				# one optional space
 906		  (?:\n[ ]*)?		# one optional newline followed by spaces
 907
 908		  \[
 909		    (.*?)		# id = $3
 910		  \]
 911
 912		)
 913	}{
 914		my $result;
 915		my $whole_match = $1;
 916		my $alt_text    = $2;
 917		my $link_id     = lc $3;
 918
 919		if ($link_id eq "") {
 920			$link_id = lc $alt_text;     # for shortcut links like ![this][].
 921		}
 922
 923		$alt_text =~ s/"/&quot;/g;
 924		if (defined $g_urls{$link_id}) {
 925			my $url = $g_urls{$link_id};
 926			$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
 927			$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
 928			$result = "<img src=\"$url\" alt=\"$alt_text\"";
 929			if (defined $g_titles{$link_id}) {
 930				my $title = $g_titles{$link_id};
 931				$title =~ s! \* !$g_escape_table{'*'}!gx;
 932				$title =~ s!  _ !$g_escape_table{'_'}!gx;
 933				$result .=  " title=\"$title\"";
 934			}
 935			$result .= $g_empty_element_suffix;
 936		}
 937		else {
 938			# If there's no such link ID, leave intact:
 939			$result = $whole_match;
 940		}
 941
 942		$result;
 943	}xsge;
 944
 945	#
 946	# Next, handle inline images:  ![alt text](url "optional title")
 947	# Don't forget: encode * and _
 948
 949	$text =~ s{
 950		(				# wrap whole match in $1
 951		  !\[
 952		    (.*?)		# alt text = $2
 953		  \]
 954		  \(			# literal paren
 955		  	[ \t]*
 956			<?(\S+?)>?	# src url = $3
 957		  	[ \t]*
 958			(			# $4
 959			  (['"])	# quote char = $5
 960			  (.*?)		# title = $6
 961			  \5		# matching quote
 962			  [ \t]*
 963			)?			# title is optional
 964		  \)
 965		)
 966	}{
 967		my $result;
 968		my $whole_match = $1;
 969		my $alt_text    = $2;
 970		my $url	  		= $3;
 971		my $title		= '';
 972		if (defined($6)) {
 973			$title		= $6;
 974		}
 975
 976		$alt_text =~ s/"/&quot;/g;
 977		$title    =~ s/"/&quot;/g;
 978		$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
 979		$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
 980		$result = "<img src=\"$url\" alt=\"$alt_text\"";
 981		if (defined $title) {
 982			$title =~ s! \* !$g_escape_table{'*'}!gx;
 983			$title =~ s!  _ !$g_escape_table{'_'}!gx;
 984			$result .=  " title=\"$title\"";
 985		}
 986		$result .= $g_empty_element_suffix;
 987
 988		$result;
 989	}xsge;
 990
 991	return $text;
 992}
 993
 994
 995sub _DoHeaders {
 996	my $text = shift;
 997
 998	# Setext-style headers:
 999	#	  Header 1
1000	#	  ========
1001	#  
1002	#	  Header 2
1003	#	  --------
1004	#
1005	$text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
1006		"<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
1007	}egmx;
1008
1009	$text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
1010		"<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
1011	}egmx;
1012
1013
1014	# atx-style headers:
1015	#	# Header 1
1016	#	## Header 2
1017	#	## Header 2 with closing hashes ##
1018	#	...
1019	#	###### Header 6
1020	#
1021	$text =~ s{
1022			^(\#{1,6})	# $1 = string of #'s
1023			[ \t]*
1024			(.+?)		# $2 = Header text
1025			[ \t]*
1026			\#*			# optional closing #'s (not counted)
1027			\n+
1028		}{
1029			my $h_level = length($1);
1030			"<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
1031		}egmx;
1032
1033	return $text;
1034}
1035
1036
1037sub _DoLists {
1038#
1039# Form HTML ordered (numbered) and unordered (bulleted) lists.
1040#
1041	my $text = shift;
1042	my $less_than_tab = $g_tab_width - 1;
1043
1044	# Re-usable patterns to match list item bullets and number markers:
1045	my $marker_ul  = qr/[*+-]/;
1046	my $marker_ol  = qr/\d+[.]/;
1047	my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
1048
1049	# Re-usable pattern to match any entirel ul or ol list:
1050	my $whole_list = qr{
1051		(								# $1 = whole list
1052		  (								# $2
1053			[ ]{0,$less_than_tab}
1054			(${marker_any})				# $3 = first list item marker
1055			[ \t]+
1056		  )
1057		  (?s:.+?)
1058		  (								# $4
1059			  \z
1060			|
1061			  \n{2,}
1062			  (?=\S)
1063			  (?!						# Negative lookahead for another list item marker
1064				[ \t]*
1065				${marker_any}[ \t]+
1066			  )
1067		  )
1068		)
1069	}mx;
1070
1071	# We use a different prefix before nested lists than top-level lists.
1072	# See extended comment in _ProcessListItems().
1073	#
1074	# Note: There's a bit of duplication here. My original implementation
1075	# created a scalar regex pattern as the conditional result of the test on
1076	# $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1077	# substitution once, using the scalar as the pattern. This worked,
1078	# everywhere except when running under MT on my hosting account at Pair
1079	# Networks. There, this caused all rebuilds to be killed by the reaper (or
1080	# perhaps they crashed, but that seems incredibly unlikely given that the
1081	# same script on the same server ran fine *except* under MT. I've spent
1082	# more time trying to figure out why this is happening than I'd like to
1083	# admit. My only guess, backed up by the fact that this workaround works,
1084	# is that Perl optimizes the substition when it can figure out that the
1085	# pattern will never change, and when this optimization isn't on, we run
1086	# afoul of the reaper. Thus, the slightly redundant code to that uses two
1087	# static s/// patterns rather than one conditional pattern.
1088
1089	if ($g_list_level) {
1090		$text =~ s{
1091				^
1092				$whole_list
1093			}{
1094				my $list = $1;
1095				my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1096				# Turn double returns into triple returns, so that we can make a
1097				# paragraph for the last item in a list, if necessary:
1098				$list =~ s/\n{2,}/\n\n\n/g;
1099				my $result = _ProcessListItems($list, $marker_any);
1100				$result = "<$list_type>\n" . $result . "</$list_type>\n";
1101				$result;
1102			}egmx;
1103	}
1104	else {
1105		$text =~ s{
1106				(?:(?<=\n\n)|\A\n?)
1107				$whole_list
1108			}{
1109				my $list = $1;
1110				my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1111				# Turn double returns into triple returns, so that we can make a
1112				# paragraph for the last item in a list, if necessary:
1113				$list =~ s/\n{2,}/\n\n\n/g;
1114				my $result = _ProcessListItems($list, $marker_any);
1115				$result = "<$list_type>\n" . $result . "</$list_type>\n";
1116				$result;
1117			}egmx;
1118	}
1119
1120
1121	return $text;
1122}
1123
1124
1125sub _ProcessListItems {
1126#
1127#	Process the contents of a single ordered or unordered list, splitting it
1128#	into individual list items.
1129#
1130
1131	my $list_str = shift;
1132	my $marker_any = shift;
1133
1134
1135	# The $g_list_level global keeps track of when we're inside a list.
1136	# Each time we enter a list, we increment it; when we leave a list,
1137	# we decrement. If it's zero, we're not in a list anymore.
1138	#
1139	# We do this because when we're not inside a list, we want to treat
1140	# something like this:
1141	#
1142	#		I recommend upgrading to version
1143	#		8. Oops, now this line is treated
1144	#		as a sub-list.
1145	#
1146	# As a single paragraph, despite the fact that the second line starts
1147	# with a digit-period-space sequence.
1148	#
1149	# Whereas when we're inside a list (or sub-list), that line will be
1150	# treated as the start of a sub-list. What a kludge, huh? This is
1151	# an aspect of Markdown's syntax that's hard to parse perfectly
1152	# without resorting to mind-reading. Perhaps the solution is to
1153	# change the syntax rules such that sub-lists must start with a
1154	# starting cardinal number; e.g. "1." or "a.".
1155
1156	$g_list_level++;
1157
1158	# trim trailing blank lines:
1159	$list_str =~ s/\n{2,}\z/\n/;
1160
1161
1162	$list_str =~ s{
1163		(\n)?							# leading line = $1
1164		(^[ \t]*)						# leading whitespace = $2
1165		($marker_any) [ \t]+			# list marker = $3
1166		((?s:.+?)						# list item text   = $4
1167		(\n{1,2}))
1168		(?= \n* (\z | \2 ($marker_any) [ \t]+))
1169	}{
1170		my $item = $4;
1171		my $leading_line = $1;
1172		my $leading_space = $2;
1173
1174		if ($leading_line or ($item =~ m/\n{2,}/)) {
1175			$item = _RunBlockGamut(_Outdent($item));
1176		}
1177		else {
1178			# Recursion for sub-lists:
1179			$item = _DoLists(_Outdent($item));
1180			chomp $item;
1181			$item = _RunSpanGamut($item);
1182		}
1183
1184		"<li>" . $item . "</li>\n";
1185	}egmx;
1186
1187	$g_list_level--;
1188	return $list_str;
1189}
1190
1191
1192
1193sub _DoCodeBlocks {
1194#
1195#	Process Markdown `<pre><code>` blocks.
1196#	
1197
1198	my $text = shift;
1199
1200	$text =~ s{
1201			(?:\n\n|\A)
1202			(	            # $1 = the code block -- one or more lines, starting with a space/tab
1203			  (?:
1204			    (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
1205			    .*\n+
1206			  )+
1207			)
1208			((?=^[ ]{0,$g_tab_width}\S)|\Z)	# Lookahead for non-space at line-start, or end of doc
1209		}{
1210			my $codeblock = $1;
1211			my $result; # return value
1212
1213			$codeblock = _EncodeCode(_Outdent($codeblock));
1214			$codeblock = _Detab($codeblock);
1215			$codeblock =~ s/\A\n+//; # trim leading newlines
1216			$codeblock =~ s/\s+\z//; # trim trailing whitespace
1217
1218			$result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1219
1220			$result;
1221		}egmx;
1222
1223	return $text;
1224}
1225
1226
1227sub _DoCodeSpans {
1228#
1229# 	*	Backtick quotes are used for <code></code> spans.
1230# 
1231# 	*	You can use multiple backticks as the delimiters if you want to
1232# 		include literal backticks in the code span. So, this input:
1233#     
1234#         Just type ``foo `bar` baz`` at the prompt.
1235#     
1236#     	Will translate to:
1237#     
1238#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1239#     
1240#		There's no arbitrary limit to the number of backticks you
1241#		can use as delimters. If you need three consecutive backticks
1242#		in your code, use four for delimiters, etc.
1243#
1244#	*	You can use spaces to get literal backticks at the edges:
1245#     
1246#         ... type `` `bar` `` ...
1247#     
1248#     	Turns to:
1249#     
1250#         ... type <code>`bar`</code> ...
1251#
1252
1253	my $text = shift;
1254
1255	$text =~ s@
1256			(`+)		# $1 = Opening run of `
1257			(.+?)		# $2 = The code block
1258			(?<!`)
1259			\1			# Matching closer
1260			(?!`)
1261		@
1262 			my $c = "$2";
1263 			$c =~ s/^[ \t]*//g; # leading whitespace
1264 			$c =~ s/[ \t]*$//g; # trailing whitespace
1265 			$c = _EncodeCode($c);
1266			"<code>$c</code>";
1267		@egsx;
1268
1269	return $text;
1270}
1271
1272
1273sub _EncodeCode {
1274#
1275# Encode/escape certain characters inside Markdown code runs.
1276# The point is that in code, these characters are literals,
1277# and lose their special Markdown meanings.
1278#
1279    local $_ = shift;
1280
1281	# Encode all ampersands; HTML entities are not
1282	# entities within a Markdown code span.
1283	s/&/&amp;/g;
1284
1285	# Encode $'s, but only if we're running under Blosxom.
1286	# (Blosxom interpolates Perl variables in article bodies.)
1287	{
1288		no warnings 'once';
1289    	if (defined($blosxom::version)) {
1290    		s/\$/&#036;/g;	
1291    	}
1292    }
1293
1294
1295	# Do the angle bracket song and dance:
1296	s! <  !&lt;!gx;
1297	s! >  !&gt;!gx;
1298
1299	# Now, escape characters that are magic in Markdown:
1300	s! \* !$g_escape_table{'*'}!gx;
1301	s! _  !$g_escape_table{'_'}!gx;
1302	s! {  !$g_escape_table{'{'}!gx;
1303	s! }  !$g_escape_table{'}'}!gx;
1304	s! \[ !$g_escape_table{'['}!gx;
1305	s! \] !$g_escape_table{']'}!gx;
1306	s! \\ !$g_escape_table{'\\'}!gx;
1307
1308	return $_;
1309}
1310
1311
1312sub _DoItalicsAndBold {
1313	my $text = shift;
1314
1315	# <strong> must go first:
1316	$text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1317		{<strong>$2</strong>}gsx;
1318
1319	$text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1320		{<em>$2</em>}gsx;
1321
1322	return $text;
1323}
1324
1325
1326sub _DoBlockQuotes {
1327	my $text = shift;
1328
1329	$text =~ s{
1330		  (								# Wrap whole match in $1
1331			(
1332			  ^[ \t]*>[ \t]?			# '>' at the start of a line
1333			    .+\n					# rest of the first line
1334			  (.+\n)*					# subsequent consecutive lines
1335			  \n*						# blanks
1336			)+
1337		  )
1338		}{
1339			my $bq = $1;
1340			$bq =~ s/^[ \t]*>[ \t]?//gm;	# trim one level of quoting
1341			$bq =~ s/^[ \t]+$//mg;			# trim whitespace-only lines
1342			$bq = _RunBlockGamut($bq);		# recurse
1343
1344			$bq =~ s/^/  /g;
1345			# These leading spaces screw with <pre> content, so we need to fix that:
1346			$bq =~ s{
1347					(\s*<pre>.+?</pre>)
1348				}{
1349					my $pre = $1;
1350					$pre =~ s/^  //mg;
1351					$pre;
1352				}egsx;
1353
1354			"<blockquote>\n$bq\n</blockquote>\n\n";
1355		}egmx;
1356
1357
1358	return $text;
1359}
1360
1361
1362sub _FormParagraphs {
1363#
1364#	Params:
1365#		$text - string to process with html <p> tags
1366#
1367	my $text = shift;
1368
1369	# Strip leading and trailing lines:
1370	$text =~ s/\A\n+//;
1371	$text =~ s/\n+\z//;
1372
1373	my @grafs = split(/\n{2,}/, $text);
1374
1375	#
1376	# Wrap <p> tags.
1377	#
1378	foreach (@grafs) {
1379		unless (defined( $g_html_blocks{$_} )) {
1380			$_ = _RunSpanGamut($_);
1381			s/^([ \t]*)/<p>/;
1382			$_ .= "</p>";
1383		}
1384	}
1385
1386	#
1387	# Unhashify HTML blocks
1388	#
1389	foreach (@grafs) {
1390		if (defined( $g_html_blocks{$_} )) {
1391			$_ = $g_html_blocks{$_};
1392		}
1393	}
1394
1395	return join "\n\n", @grafs;
1396}
1397
1398
1399sub _EncodeAmpsAndAngles {
1400# Smart processing for ampersands and angle brackets that need to be encoded.
1401
1402	my $text = shift;
1403
1404	# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1405	#   http://bumppo.net/projects/amputator/
1406 	$text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1407
1408	# Encode naked <'s
1409 	$text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1410
1411	return $text;
1412}
1413
1414
1415sub _EncodeBackslashEscapes {
1416#
1417#   Parameter:  String.
1418#   Returns:    The string, with after processing the following backslash
1419#               escape sequences.
1420#
1421    local $_ = shift;
1422
1423    s! \\\\  !$g_escape_table{'\\'}!gx;		# Must process escaped backslashes first.
1424    s! \\`   !$g_escape_table{'`'}!gx;
1425    s! \\\*  !$g_escape_table{'*'}!gx;
1426    s! \\_   !$g_escape_table{'_'}!gx;
1427    s! \\\{  !$g_escape_table{'{'}!gx;
1428    s! \\\}  !$g_escape_table{'}'}!gx;
1429    s! \\\[  !$g_escape_table{'['}!gx;
1430    s! \\\]  !$g_escape_table{']'}!gx;
1431    s! \\\(  !$g_escape_table{'('}!gx;
1432    s! \\\)  !$g_escape_table{')'}!gx;
1433    s! \\>   !$g_escape_table{'>'}!gx;
1434    s! \\\#  !$g_escape_table{'#'}!gx;
1435    s! \\\+  !$g_escape_table{'+'}!gx;
1436    s! \\\-  !$g_escape_table{'-'}!gx;
1437    s! \\\.  !$g_escape_table{'.'}!gx;
1438    s{ \\!  }{$g_escape_table{'!'}}gx;
1439
1440    return $_;
1441}
1442
1443
1444sub _DoAutoLinks {
1445	my $text = shift;
1446
1447	$text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1448
1449	# Email addresses: <address@domain.foo>
1450	$text =~ s{
1451		<
1452        (?:mailto:)?
1453		(
1454			[-.\w]+
1455			\@
1456			[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1457		)
1458		>
1459	}{
1460		_EncodeEmailAddress( _UnescapeSpecialChars($1) );
1461	}egix;
1462
1463	return $text;
1464}
1465
1466
1467sub _EncodeEmailAddress {
1468#
1469#	Input: an email address, e.g. "foo@example.com"
1470#
1471#	Output: the email address as a mailto link, with each character
1472#		of the address encoded as either a decimal or hex entity, in
1473#		the hopes of foiling most address harvesting spam bots. E.g.:
1474#
1475#	  <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1476#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1477#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1478#
1479#	Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1480#	mailing list: <http://tinyurl.com/yu7ue>
1481#
1482
1483	my $addr = shift;
1484
1485	srand;
1486	my @encode = (
1487		sub { '&#' .                 ord(shift)   . ';' },
1488		sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1489		sub {                            shift          },
1490	);
1491
1492	$addr = "mailto:" . $addr;
1493
1494	$addr =~ s{(.)}{
1495		my $char = $1;
1496		if ( $char eq '@' ) {
1497			# this *must* be encoded. I insist.
1498			$char = $encode[int rand 1]->($char);
1499		} elsif ( $char ne ':' ) {
1500			# leave ':' alone (to spot mailto: later)
1501			my $r = rand;
1502			# roughly 10% raw, 45% hex, 45% dec
1503			$char = (
1504				$r > .9   ?  $encode[2]->($char)  :
1505				$r < .45  ?  $encode[1]->($char)  :
1506							 $encode[0]->($char)
1507			);
1508		}
1509		$char;
1510	}gex;
1511
1512	$addr = qq{<a href="$addr">$addr</a>};
1513	$addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1514
1515	return $addr;
1516}
1517
1518
1519sub _UnescapeSpecialChars {
1520#
1521# Swap back in all the special characters we've hidden.
1522#
1523	my $text = shift;
1524
1525	while( my($char, $hash) = each(%g_escape_table) ) {
1526		$text =~ s/$hash/$char/g;
1527	}
1528    return $text;
1529}
1530
1531
1532sub _TokenizeHTML {
1533#
1534#   Parameter:  String containing HTML markup.
1535#   Returns:    Reference to an array of the tokens comprising the input
1536#               string. Each token is either a tag (possibly with nested,
1537#               tags contained therein, such as <a href="<MTFoo>">, or a
1538#               run of text between tags. Each element of the array is a
1539#               two-element array; the first is either 'tag' or 'text';
1540#               the second is the actual value.
1541#
1542#
1543#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1544#       <http://www.bradchoate.com/past/mtregex.php>
1545#
1546
1547    my $str = shift;
1548    my $pos = 0;
1549    my $len = length $str;
1550    my @tokens;
1551
1552    my $depth = 6;
1553    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
1554    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
1555                   (?s: <\? .*? \?> ) |              # processing instruction
1556                   $nested_tags/ix;                   # nested tags
1557
1558    while ($str =~ m/($match)/g) {
1559        my $whole_tag = $1;
1560        my $sec_start = pos $str;
1561        my $tag_start = $sec_start - length $whole_tag;
1562        if ($pos < $tag_start) {
1563            push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1564        }
1565        push @tokens, ['tag', $whole_tag];
1566        $pos = pos $str;
1567    }
1568    push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1569    \@tokens;
1570}
1571
1572
1573sub _Outdent {
1574#
1575# Remove one level of line-leading tabs or spaces
1576#
1577	my $text = shift;
1578
1579	$text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1580	return $text;
1581}
1582
1583
1584sub _Detab {
1585#
1586# Cribbed from a post by Bart Lateur:
1587# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1588#
1589	my $text = shift;
1590
1591	$text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1592	return $text;
1593}
1594
1595
15961;
1597
1598__END__
1599
1600
1601=pod
1602
1603=head1 NAME
1604
1605B<Markdown>
1606
1607
1608=head1 SYNOPSIS
1609
1610B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1611    [ I<file> ... ]
1612
1613
1614=head1 DESCRIPTION
1615
1616Markdown is a text-to-HTML filter; it translates an easy-to-read /
1617easy-to-write structured text format into HTML. Markdown's text format
1618is most similar to that of plain text email, and supports features such
1619as headers, *emphasis*, code blocks, blockquotes, and links.
1620
1621Markdown's syntax is designed not as a generic markup language, but
1622specifically to serve as a front-end to (X)HTML. You can  use span-level
1623HTML tags anywhere in a Markdown document, and you can use block level
1624HTML tags (like <div> and <table> as well).
1625
1626For more information about Markdown's syntax, see:
1627
1628    http://daringfireball.net/projects/markdown/
1629
1630
1631=head1 OPTIONS
1632
1633Use "--" to end switch parsing. For example, to open a file named "-z", use:
1634
1635	Markdown.pl -- -z
1636
1637=over 4
1638
1639
1640=item B<--html4tags>
1641
1642Use HTML 4 style for empty element tags, e.g.:
1643
1644    <br>
1645
1646instead of Markdown's default XHTML style tags, e.g.:
1647
1648    <br />
1649
1650
1651=item B<-v>, B<--version>
1652
1653Display Markdown's version number and copyright information.
1654
1655
1656=item B<-s>, B<--shortversion>
1657
1658Display the short-form version number.
1659
1660
1661=back
1662
1663
1664
1665=head1 BUGS
1666
1667To file bug reports or feature requests (other than topics listed in the
1668Caveats section above) please send email to:
1669
1670    support@daringfireball.net
1671
1672Please include with your report: (1) the example input; (2) the output
1673you expected; (3) the output Markdown actually produced.
1674
1675
1676=head1 VERSION HISTORY
1677
1678See the readme file for detailed release notes for this version.
1679
16801.0.1 - 14 Dec 2004
1681
16821.0 - 28 Aug 2004
1683
1684
1685=head1 AUTHOR
1686
1687    John Gruber
1688    http://daringfireball.net
1689
1690    PHP port and other contributions by Michel Fortin
1691    http://michelf.com
1692
1693
1694=head1 COPYRIGHT AND LICENSE
1695
1696Copyright (c) 2003-2004 John Gruber   
1697<http://daringfireball.net/>   
1698All rights reserved.
1699
1700Redistribution and use in source and binary forms, with or without
1701modification, are permitted provided that the following conditions are
1702met:
1703
1704* Redistributions of source code must retain the above copyright notice,
1705  this list of conditions and the following disclaimer.
1706
1707* Redistributions in binary form must reproduce the above copyright
1708  notice, this list of conditions and the following disclaimer in the
1709  documentation and/or other materials provided with the distribution.
1710
1711* Neither the name "Markdown" nor the names of its contributors may
1712  be used to endorse or promote products derived from this software
1713  without specific prior written permission.
1714
1715This software is provided by the copyright holders and contributors "as
1716is" and any express or implied warranties, including, but not limited
1717to, the implied warranties of merchantability and fitness for a
1718particular purpose are disclaimed. In no event shall the copyright owner
1719or contributors be liable for any direct, indirect, incidental, special,
1720exemplary, or consequential damages (including, but not limited to,
1721procurement of substitute goods or services; loss of use, data, or
1722profits; or business interruption) however caused and on any theory of
1723liability, whether in contract, strict liability, or tort (including
1724negligence or otherwise) arising in any way out of the use of this
1725software, even if advised of the possibility of such damage.
1726
1727=cut