diff options
author | Joey Hess <joey@kitenet.net> | 2010-11-16 16:57:50 -0400 |
---|---|---|
committer | Joey Hess <joey@kitenet.net> | 2010-11-16 16:57:50 -0400 |
commit | b00c6c9640453bf1407c4e880ef0c171388197c7 (patch) | |
tree | 24fe57766a1522af6a86dcc76df71a015c04e2e3 | |
parent | fcf0ee574a974ca482509da595cdf9196a3afbb3 (diff) | |
download | ikiwiki-b00c6c9640453bf1407c4e880ef0c171388197c7.tar ikiwiki-b00c6c9640453bf1407c4e880ef0c171388197c7.tar.gz |
inline: Improve RSS url munging to use a proper html parser
and support all elements that HTML::Tagset knows about.
(Which doesn't include html5 just yet, but then the old version didn't either.)
Bonus: 4 times faster than old regexp method.
-rw-r--r-- | IkiWiki/Plugin/inline.pm | 63 | ||||
-rw-r--r-- | debian/changelog | 4 |
2 files changed, 52 insertions, 15 deletions
diff --git a/IkiWiki/Plugin/inline.pm b/IkiWiki/Plugin/inline.pm index 3b98bf8dd..1fe40a5ea 100644 --- a/IkiWiki/Plugin/inline.pm +++ b/IkiWiki/Plugin/inline.pm @@ -506,26 +506,59 @@ sub date_822 ($) { } sub absolute_urls ($$) { - # sucky sub because rss sucks - my $content=shift; + # needed because rss sucks + my $html=shift; my $baseurl=shift; my $url=$baseurl; $url=~s/[^\/]+$//; + my $urltop; # calculated if needed - # what is the non path part of the url? - my $top_uri = URI->new($url); - $top_uri->path_query(""); # reset the path - my $urltop = $top_uri->as_string; - - $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(#[^"]+)"/$1 href="$baseurl$2"/mig; - # relative to another wiki page - $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)([^\/][^"]*)"/$1 href="$url$2"/mig; - $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)([^\/][^"]*)"/$1 src="$url$2"/mig; - # relative to the top of the site - $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)(\/[^"]*)"/$1 href="$urltop$2"/mig; - $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)(\/[^"]*)"/$1 src="$urltop$2"/mig; - return $content; + my $ret=""; + + eval q{use HTML::Parser; use HTML::Tagset}; + die $@ if $@; + my $p = HTML::Parser->new(api_version => 3); + $p->handler(default => sub { $ret.=join("", @_) }, "text"); + $p->handler(start => sub { + my ($tagname, $pos, $text) = @_; + if (ref $HTML::Tagset::linkElements{$tagname}) { + while (4 <= @$pos) { + # use attribute sets from right to left + # to avoid invalidating the offsets + # when replacing the values + my ($k_offset, $k_len, $v_offset, $v_len) = + splice(@$pos, -4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}}; + next unless $v_offset; # 0 v_offset means no value + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + if ($v=~/^#/) { + $v=$baseurl.$v; # anchor + } + elsif ($v=~/^(?!\w+:)[^\/]/) { + $v=$url.$v; # relative url + } + elsif ($v=~/^\//) { + if (! defined $urltop) { + # what is the non path part of the url? + my $top_uri = URI->new($url); + $top_uri->path_query(""); # reset the path + $urltop = $top_uri->as_string; + } + $v=$urltop.$v; # url relative to top of site + } + $v =~ s/\"/"/g; # since we quote with "" + substr($text, $v_offset, $v_len) = qq("$v"); + } + } + $ret.=$text; + }, "tagname, tokenpos, text"); + $p->parse($html); + $p->eof; + + return $ret; } sub genfeed ($$$$$@) { diff --git a/debian/changelog b/debian/changelog index 00c32e95d..0edb78004 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,10 @@ ikiwiki (3.20101113) UNRELEASED; urgency=low * more: Add pages parameter to limit where the more is displayed. (thanks, dark) * Fix escaping of filenames in historyurl. (Thanks, aj) + * inline: Improve RSS url munging to use a proper html parser, + and support all elements that HTML::Tagset knows about. + (Which doesn't include html5 just yet, but then the old version + didn't either.) Bonus: 4 times faster than old regexp method. -- Joey Hess <joeyh@debian.org> Tue, 16 Nov 2010 14:23:47 -0400 |