aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoey Hess <joey@kitenet.net>2010-11-16 16:57:50 -0400
committerJoey Hess <joey@kitenet.net>2010-11-16 16:57:50 -0400
commitb00c6c9640453bf1407c4e880ef0c171388197c7 (patch)
tree24fe57766a1522af6a86dcc76df71a015c04e2e3
parentfcf0ee574a974ca482509da595cdf9196a3afbb3 (diff)
downloadikiwiki-b00c6c9640453bf1407c4e880ef0c171388197c7.tar
ikiwiki-b00c6c9640453bf1407c4e880ef0c171388197c7.tar.gz
inline: Improve RSS url munging to use a proper html parser
and support all elements that HTML::Tagset knows about. (Which doesn't include html5 just yet, but then the old version didn't either.) Bonus: 4 times faster than old regexp method.
-rw-r--r--IkiWiki/Plugin/inline.pm63
-rw-r--r--debian/changelog4
2 files changed, 52 insertions, 15 deletions
diff --git a/IkiWiki/Plugin/inline.pm b/IkiWiki/Plugin/inline.pm
index 3b98bf8dd..1fe40a5ea 100644
--- a/IkiWiki/Plugin/inline.pm
+++ b/IkiWiki/Plugin/inline.pm
@@ -506,26 +506,59 @@ sub date_822 ($) {
}
sub absolute_urls ($$) {
- # sucky sub because rss sucks
- my $content=shift;
+ # needed because rss sucks
+ my $html=shift;
my $baseurl=shift;
my $url=$baseurl;
$url=~s/[^\/]+$//;
+ my $urltop; # calculated if needed
- # what is the non path part of the url?
- my $top_uri = URI->new($url);
- $top_uri->path_query(""); # reset the path
- my $urltop = $top_uri->as_string;
-
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(#[^"]+)"/$1 href="$baseurl$2"/mig;
- # relative to another wiki page
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)([^\/][^"]*)"/$1 href="$url$2"/mig;
- $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)([^\/][^"]*)"/$1 src="$url$2"/mig;
- # relative to the top of the site
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)(\/[^"]*)"/$1 href="$urltop$2"/mig;
- $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)(\/[^"]*)"/$1 src="$urltop$2"/mig;
- return $content;
+ my $ret="";
+
+ eval q{use HTML::Parser; use HTML::Tagset};
+ die $@ if $@;
+ my $p = HTML::Parser->new(api_version => 3);
+ $p->handler(default => sub { $ret.=join("", @_) }, "text");
+ $p->handler(start => sub {
+ my ($tagname, $pos, $text) = @_;
+ if (ref $HTML::Tagset::linkElements{$tagname}) {
+ while (4 <= @$pos) {
+ # use attribute sets from right to left
+ # to avoid invalidating the offsets
+ # when replacing the values
+ my ($k_offset, $k_len, $v_offset, $v_len) =
+ splice(@$pos, -4);
+ my $attrname = lc(substr($text, $k_offset, $k_len));
+ next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
+ next unless $v_offset; # 0 v_offset means no value
+ my $v = substr($text, $v_offset, $v_len);
+ $v =~ s/^([\'\"])(.*)\1$/$2/;
+ if ($v=~/^#/) {
+ $v=$baseurl.$v; # anchor
+ }
+ elsif ($v=~/^(?!\w+:)[^\/]/) {
+ $v=$url.$v; # relative url
+ }
+ elsif ($v=~/^\//) {
+ if (! defined $urltop) {
+ # what is the non path part of the url?
+ my $top_uri = URI->new($url);
+ $top_uri->path_query(""); # reset the path
+ $urltop = $top_uri->as_string;
+ }
+ $v=$urltop.$v; # url relative to top of site
+ }
+ $v =~ s/\"/&quot;/g; # since we quote with ""
+ substr($text, $v_offset, $v_len) = qq("$v");
+ }
+ }
+ $ret.=$text;
+ }, "tagname, tokenpos, text");
+ $p->parse($html);
+ $p->eof;
+
+ return $ret;
}
sub genfeed ($$$$$@) {
diff --git a/debian/changelog b/debian/changelog
index 00c32e95d..0edb78004 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -4,6 +4,10 @@ ikiwiki (3.20101113) UNRELEASED; urgency=low
* more: Add pages parameter to limit where the more is displayed.
(thanks, dark)
* Fix escaping of filenames in historyurl. (Thanks, aj)
+ * inline: Improve RSS url munging to use a proper html parser,
+ and support all elements that HTML::Tagset knows about.
+ (Which doesn't include html5 just yet, but then the old version
+ didn't either.) Bonus: 4 times faster than old regexp method.
-- Joey Hess <joeyh@debian.org> Tue, 16 Nov 2010 14:23:47 -0400