use 5.014;
use utf8;
use Encode;
use Irssi;
use POSIX ();

our $VERSION = "1.4";
our %IRSSI = (
    authors     => 'David Leadbeater',
    contact     => 'dgl@dgl.cx',
    name        => 'urlinfo',
    description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)',
    license     => 'WTFPL/MIT <http://dgl.cx/licence>',
    url         => 'http://dgl.cx/irssi',
);

# This is needed so it can still run standalone for testing.
BEGIN {
  Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17
}

BEGIN {
  eval {
    require HTML::TreeBuilder;
    require URI;
  } or do {
    print "\x{3}8You need to install HTML::TreeBuilder and URI";
    if (-f "/etc/debian_version") {
      print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl";
    }
    die $@;
  }
}

# ------- Settings
# /SET urlinfo_title_unknown ON|OFF
#   Show title of all unknown sites. (There is a timeout to prevent against
#   obvious resource exhaustion attacks, but remember this script has no
#   warranty.)
#
# /SET urlinfo_timeout 10
#   How many seconds after which to give up trying to fetch a URL
#
# /SET urlinfo_ignore_domains example\.org example\.com
#   Space separated list of regular expressions of domains to ignore
#
# /SET urlinfo_ignore_targets freenode #something efnet/#example
#   Space separated list of targets to ignore.
#
# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description
#   A limited way of configuring custom domains, if you need something more
#   complex edit SITES below.
#   Format: domain[/path[=from]]

# ------- Sites configuration

# This script aims to be data driven, this hash has "site_name => {site details}"
# site details is a hash reference which can contain:
#   cleanup: A regexp of text to remove from the resulting string
#   domain: A string (or regexp, with qr//) of the domain to match (www. is
#     removed automatically).
#   from: Where to read the info "title", "description" (meta description) or
#     a regexp to match the content (default "title")
#   example: Example of this URL
#   expected: What the example should return (see end for testing)
#   items: An array ref of additional hashes to allow multiple of these values
#     e.g.: items => [ { domain => "example.com" } ]
#   path: Path component (string or regexp)
#
my %SITES = (
  vimeo => {
    cleanup => qr/\s*on Vimeo$/,
    domain => "vimeo.com",
    path => qr{/\d+},
    example => "http://vimeo.com/80871338",
    expected => "Journey Part 1",
  },
  youtube => {
    cleanup => qr/\s*-\s*YouTube$/,
    items => [
      {
        domain => "youtu.be",
        example => "http://youtu.be/wa1c6EU2bY0",
        expected => "I Am The Resurrection (Remastered)",
      },
      {
        domain => "youtube.com",
        path => "/watch",
        example => "https://www.youtube.com/watch?v=q99JgYrgzco&list=PLE57B71744156439A",
        expected => "I Wanna Be Adored (Remastered)",
      },
    ],
  },
  metacpan => {
    cleanup => qr/\s*-\s*metacpan\.org$/,
    domain => "metacpan.org",
    path => qr{^/pod/},
    example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm",
    from => "description",
    expected => "read, eval, print, loop, yay!",
  },
  pypi => {
    domain => "pypi.python.org",
    path => qr{^/pypi/},
    from => "description",
    example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0",
    expected => "A Stanford Core NLP wrapper (wordseer fork)",
  },
  gist => {
    cleanup => qr/\s*-\s*Gist is .*$/,
    domain => "gist.github.com",
    from => ["og:title", "description"],
    example => "https://gist.github.com/dgl/792206",
    expected => "dgl/installblead: An install script that installs a development version of perl (from git) and keeps a particular set of modules installed. Sort of perlbrew for blead, but not quite.",
  },
  github => {
    domain => "github.com",
    items => [
      { # issue, commit or pull
        cleanup => qr/\s*·.*$/,
        path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]},
        example => "https://github.com/irssi/irssi/commit/669add",
        expected => "FS#155 hilight -tag",
      },
      { # user or project
        from => "og:description",
        path => qr{^/[^/]+(?:/[^/]+)?$},
        example => "https://github.com/irssi/irssi",
        expected => "irssi - The client of the future",
      },
    ],
  },
);

# ------- Site handling

sub expand {
  my @expanded_sites;
  for my $site(keys %SITES) {
    expand_site(\@expanded_sites, $site, $SITES{$site}, {});
  }
  return @expanded_sites;
}

# This essentially implements inheritance (via the "items" key), to reduce
# duplication.
sub expand_site {
  my($expanded_sites, $site, $site_data, $current) = @_;
  delete $current->{items};
  my $s = {
    name => $site,
    from => "title",
    %$current,
    %$site_data,
  };
  if (exists $s->{items}) {
    expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}};
  } else {
    push @$expanded_sites, $s;
  }
}

sub _matcher {
  my($site, $item) = @_;
  return 1 unless defined $site;
  return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site;
  return 1 if ref $site && $site->isa("Regexp") && $item =~ $site;
  return $site eq $item;
}

sub get_site {
  my($sites, $url) = @_;

  my $uri = URI->new($url);
  $uri = URI->new("http://$url") unless $uri and $uri->scheme;
  return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/;
  
  for my $site(@$sites) {
    my $match = 1;
    $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri);
    $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment);
    return $site, $uri if $match;
  }

  if (Irssi::settings_get_bool("urlinfo_title_unknown")) {
    return { name => "unknown", from => "title" }, $uri;
  }

  return;
}

my %from = (
  title => sub {
    $_[0]->look_down(_tag => 'title')->as_trimmed_text;
  },
  description => sub {
    my $el = $_[0]->look_down(_tag => 'meta', name => 'description');
    $el && $el->attr('content');
  },
  'og:description' => sub {
    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description');
    $el && $el->attr('content');
  },
  'og:title' => sub {
    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title');
    $el && $el->attr('content');
  },
);

sub get_info {
  my($site, $uri) = @_;
  my $from = $site->{from};
  my $tree = HTML::TreeBuilder->new_from_url($uri);
  my $info;
  if (!ref $from || ref $from eq 'ARRAY') {
    $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from;
  } else {
    $info = join "", $tree->as_html =~ $from;
  }
  $info =~ s/$site->{cleanup}// if $site->{cleanup};
  $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger;
}

# ------- IRC message handling

# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc)
my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))};

my $pipe_in_progress;
my @sites;
my $timeout = 10;

sub msg {
  my($server, $text, $nick, undef, $target) = @_;
  # TODO: Add a queue / multiple pipe support?
  return if $pipe_in_progress;

  my $msg_time = time;
  my $tag = $server->{tag};
  my $target = $target || $nick;

  if (my($url) = $text =~ $URL_RE) {
    my($site, $uri) = get_site(\@sites, $url);
    return unless $site;
    return if ignored($uri, $server, $target);

    fork_wrapper(sub { # Child
      my($fh) = @_;
      syswrite $fh, "  " . encode_utf8(get_info($site, $uri));
    },
    sub { # Parent
      my $in = decode_utf8($_[0]);
      if ($in =~ s/^- //) {
        print "\x{3}4urlinfo error:\x{3} $in";
        return;
      }
      $in =~ s/^  //;
      return unless $in;

      # Avoid reusing server just in case it is no longer valid
      my $server = Irssi::server_find_tag($tag);
      my $win = find_window($server, $target);

      my $view = $win->view;
      my $line = $view->get_lines;
      while ($line && ($line = $line->next)) {
        if ($line->{info}->{time} >= $msg_time) {
          if ($line->get_text(0) =~ /\Q$url/) {
            last;
          }
        }
      }

      my $timestamp = POSIX::strftime(
        Irssi::settings_get_str("timestamp_format"), localtime $msg_time);
      # I'm sure I shouldn't have to care about colours here...
      my $pad = length Irssi::strip_codes($timestamp);

      my $text = $win->format_get_text(__PACKAGE__, $server, $target,
        "urlinfo", " " x $pad, $in);
      $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP,
        $text, $msg_time);
      $view->redraw;
    });
  }
}

sub ignored {
  my($uri, $server, $target) = @_;
  my @ignored_domains = split / /, Irssi::settings_get_str('urlinfo_ignore_domains');
  my $domain = $uri->host =~ s/^www\.//r;
  return 1 if grep $domain =~ /^$_$/, @ignored_domains;

  my $chans = $server->isupport("chantypes") || '#&';
  my $chan_match = qr/^[$chans]/;

  for my $ignored_target (split / /, Irssi::settings_get_str('urlinfo_ignore_targets')) {
    my($mtag, $mtarget) = split m{/}, $ignored_target;
    if ($mtag =~ $chan_match) {
      $mtarget = $mtag;
      $mtag = "*";
    }
    return 1 if _match($mtag, $server->{tag}) &&
      (!$mtarget || _match($mtarget, $target));
  }

  return 0;
}

sub _match {
  my($pattern, $name) = @_;
  $pattern =~ s/\*/.*/g;
  $name =~ /^$pattern$/i;
}

sub find_window {
  my($server, $target) = @_;
  if (my $witem = $server->window_item_find($target)) {
    return $witem->window;
  } else {
    # Maybe they have a msgs window?
    my $win = Irssi::window_find_name("(msgs)");
    # Ultimate fallback
    $win = Irssi::window_find_refnum(1) unless $win;
    return $win;
  }
}

# Based on scriptassist.
sub fork_wrapper {
  my($child, $parent) = @_;

  pipe(my $rfh, my $wfh);

  my $pid = fork;
  $pipe_in_progress = 1;

  return unless defined $pid;

  if ($pid) {
    close $wfh;
    Irssi::pidwait_add($pid);
    my $pipetag;
    my @args = ($rfh, \$pipetag, $parent);
    $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args);
  } else {
    eval {
      local $SIG{ALRM} = sub { die "Timed out\n" };
      alarm $timeout;
      $child->($wfh);
    };
    alarm 0;
    syswrite $wfh, encode_utf8("- $@") if $@;
    POSIX::_exit(1);
  }
}

sub pipe_input {
  my ($rfh, $pipetag, $parent) = @{$_[0]};
  my $line = <$rfh>;
  close($rfh);
  Irssi::input_remove($$pipetag);
  $pipe_in_progress = 0;
  $parent->($line);
}

sub setup_changed {
  $timeout = Irssi::settings_get_int("urlinfo_timeout");

  @sites = expand();
  for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) {
    next unless $site;

    my($re, $from) = split /=/, $site;
    $from ||= "title";
    my($domain, $path) = split m{/}, $re, 2;
    expand_site(\@sites, "custom", {
        domain => qr/^$domain$/,
        path => defined $path ? qr/^\/$path/ : undef,
        from => $from,
    }, {});
  }
}

# ------- Initialization

if (caller) {
  # Irssi specific initialization
  require Irssi::TextUI;

  Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", "");
  Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", "");
  Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_targets", "");
  Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout);
  Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0);

  Irssi::signal_add("message irc action" => \&msg);
  Irssi::signal_add("message private" => \&msg);
  Irssi::signal_add("message public" => \&msg);

  Irssi::signal_add_last("setup changed", \&setup_changed);
  setup_changed();

  Irssi::theme_register([
    'urlinfo' => '$0 %Kinfo:%n $1',
  ]);

} else {
  # Built in test. Run this script outside Irssi to use.
  @sites = expand();
  for my $site(@sites) {
    next unless $site->{example};
    my($found_site, $uri) = get_site(\@sites, $site->{example});
    if ($found_site != $site) {
      die "Got $found_site->{name}, expected $site->{name}";
    }
    say "Get $uri";
    my $result = get_info($site, $uri);
    say $result;
    die "Got $result, expected $site->{expected}" unless $result eq $site->{expected};
  }
  say "OK";
}
