# $Id: url.pl,v 1.40 2002/02/14 07:05:47 jylefort Exp $

use Irssi 20020121.2020 ();
$VERSION = "0.45";
%IRSSI = (
	  authors     => 'Jean-Yves "decadix" Lefort',
	  contact     => 'jylefort\@brutele.be, decadix on IRCNet',
	  name        => 'url',
	  description => 'Simply the most powerful URL grabber for Irssi',
	  license     => 'BSD',
	  changed     => '$Date: 2002/02/14 07:05:47 $ ',
);

# description:
#
#	url.pl will grab URLs in messages and generate a text file.
#	By 'text file', I mean the opposite of binary file; url.pl can
#	generate any type of text file (plain text, HTML, XML, etc).
#
#	The aspect of the generated file will be governed by a template file
#	(see below for a description of its format).
#
#	The user can choose which format to view using /URL <format>.
#
#	For instance if a file named 'template.html' is found in
#	the template directory and the user types /URL html, a nice HTML
#	page will be generated and loaded into a web browser.
#
# quick start:
#
#	* type /URL -CREATE
#	* edit the $command keyword in ~/.irssi/url/template.html
#	* edit the browse_command setting
#
#	You are now ready to view an HTML page containing the grabbed URLs
#	by typing /URL, and to load the last grabbed URL in your browser
#	by typing /URL -.
#
# format of a template file:
#
#	A template file must be named 'template.FORMAT', where FORMAT is any
#	value that will be passed by the user using /URL <format>.
#	FORMAT will be also used to choose the extension of the generated file.
#
#	A template file consists of keyword/value pairs.
#
#	A single line value can be specified like this:
#
#		$KEYWORD: VALUE
#
#	A multiline value can be specified like this:
#
#		$KEYWORD:
#			VALUE
#		$NEXTKEYWORD: ...
#
#	See below for a list of keywords.
#
#	Type /URL -CREATE and look in ~/.irssi/url to see how a
#	template file might look like.
#
# keywords:
#
#	command
#
#		This command will be run on the generated file to view it.
#		%f will be replaced by the full pathname of the file.
#
#	item_timestamp_format
#
#		This will govern the aspect of the %s substitution in
#		item_template (see below).
#
#	file_timestamp_format
#
#		This will govern the aspect of the %s substitution in
#		file_template (see below).
#
#	item_template
#
#		This will govern the appearance of each URL item in the file.
#		The following percent substitutions will be performed:
#
#			%s	time (strftime(3) will be used, with the
#				value of item_timestamp_format passed as
#				a format string)
#			%t	target of message (ie 'decadix', '#irssi'...)
#			%l	message levels (ie 'PUBLIC', 'MSGS')
#			%{	everything before the url (ie 'check this: ')
#			%u	the URL (ie 'www.FreeBSD.org');
#			%}	everything after the url (ie ' - it rules')
#			%m	the whole message (a shortcut to %{%u%})
#			%r	the 'real' URL: the %u above might not really
#				be an URL in the sense intended by RFC1738;
#				in fact url.pl grabs much more than that:
#				for instance 'www.netbsd.org', which is not
#				an URL but a hostname; or 'abuse@decadix.org',
#				which is an email address;
#				%r will be the URL as specified by RFC1738,
#				(eg your web browser will interpret it
#				correctly);
#				for instance 'www.netbsd.org' will be
#				converted to 'http://www.netbsd.org';
#				'ftp.FreeBSD.org' to 'ftp://ftp.FreeBSD.org';
#				'me@decadix.org' to 'mailto:me@decadix.org'
#
#	file_template
#
#		This will govern the appearance of the whole file.
#		The following percent substitutions will be performed:
#
#			%s	time (strftime(3) will be used, with the
#				value of file_timestamp_format passed as
#				a format string)
#			%i	the list of URL items concatenated
#
# /set's:
#
#	url_grab_level
#
#		message levels to take in consideration
#		example: PUBLICS ACTIONS
#
#	url_redundant
#
#		whether to store same URLs multiple times or not
#		example: ON
#
#	url_verbose_grab
#
#		whether to grab verbosely or not
#		example: OFF
#
#	url_hilight
#
#		whether to hilight the URLs in the text or not
#		the 'url_text' format will be used for this purpose (see below)
#		example: OFF
#
#	url_color
#
#		mirc color string to use for url hilighting
#
#	url_browse_command
#
#		a command to display an URL when /URL - is used
#		%u will be replaced by the URL
#		example: galeon %u &
#
#	url_template_path
#
#		where to find the template files
#		example: ~/.irssi/templates
#
#	url_file
#
#		where to write the URL list (the format will be used as
#		the file extension)
#		example: ~/.irssi-urls
#
#	url_default_format
#
#		the default format to use if not specified via /URL <format>
#		example: html
#
# commands
#
#	/URL [-|-clear|-create|<format>]
#
#		- will run your web browser on the last URL.
#
#		-clear will clear the URL list.
#
#		-create will create a set of sample template files in the
#		directory specified in the url_template_path setting; the
#		directory will be created if it doesn't exists.
#
#		<format> will be used to construct the template file to use;
#		for instance if 'html' is specified, the template file used
#		will be 'template.html'.
#
#		If no arguments are specified, the default format found in
#		/set url_default_format will be used.
#
# changes:
#
#	2002-02-14	release 0.45
#			* replaced theme capability by /set url_color,
#			  fixing a bug in the URL hilighting
#
#	2002-02-09	release 0.44
#			* 0.43 didn't grabbed anything: fixed
#
#	2002-02-09	release 0.43
#			* url_hilight was _still_ causing an infinite loop
#			  under certain conditions: fixed
#			* URLs found at the start of a message were
#			  hilighted wrongly: fixed
#
#	2002-02-09	release 0.42
#			* if url_hilight was enabled, an infinite loop was
#			  caused while printing the hilighted message: fixed
#
#	2002-02-08	release 0.41
#			* safer percent substitutions
#			* improved URL regexp
#
#	2002-02-08	release 0.40
#			* added /URL -create command
#			* added url_hilight setting
#
#	2002-02-01	release 0.34
#			* more precise URL regexp
#
#	2002-02-01	release 0.33
#			* added /URL - command
#			* added url_redundant setting
#
#	2002-02-01	release 0.32
#			* some little improvements made in the URL regexp
#
#	2002-01-31	release 0.31
#			* oops, '<@idiot> I am really stupid' was grabbed coz
#			  the '@' mode char trigerred the email regexp
#
#	2002-01-31	release 0.30
#			* major update: not HTML-oriented anymore; can generate
#			  any type of text file by the use of template files
#
#	2002-01-28	release 0.23
#			* changes in url_item and url_item_timestamp_format
#			  settings will now be seen immediately
#			* "Added item #n in URL list" is now printed after
#			  the grabbed message
#
#	2002-01-28	release 0.22
#			* messages are now saved as they were printed in irssi
#			* removed %n format of url_item
#
#	2002-01-27	release 0.21
#			* uses builtin expand
#
#	2002-01-27	release 0.20
#			* added a %s format to url_item
#			* changed the %d format of url_page to %s
#			* added url_{page|item}_timestamp_format settings
#			* reworked the documentation
#
#	2002-01-25	release 0.12
#			* added url_verbose_grab_setting
#	
#	2002-01-24	release 0.11
#			* now handles actions correctly
#
#	2002-01-23	initial release
#
# todo:
#
#	* add a 'url_grab_own_messages' setting

use strict;
use POSIX qw(strftime);
use File::Spec qw(catfile);

use constant MSGLEVEL_NO_URL => 0x0400000;
my $url_regexp = '[a-zA-Z0-9+-.]+:\/\/[^ \t,\[\]]+|(^|(?<=\s))((www\.|ftp\.|irc\.|(mailto:)?[a-zA-Z-_.0-9!%]+@)[a-zA-Z0-9.-]+|[a-zA-Z0-9.-]+\.(com|org|gov|net|edu|biz|info))(\/[^ \t,\[\]]+)?';

my @items;

# -verbatim- import expand
sub expand {
  my ($string, %format) = @_;
  my ($len, $attn, $repl) = (length $string, 0);
  
  $format{'%'} = '%';

  for (my $i = 0; $i < $len; $i++) {
    my $char = substr $string, $i, 1;
    if ($attn) {
      $attn = undef;
      if (exists($format{$char})) {
	$repl .= $format{$char};
      } else {
	$repl .= '%' . $char;
      }
    } elsif ($char eq '%') {
      $attn = 1;
    } else {
      $repl .= $char;
    }
  }
  
  return $repl;
}
# -verbatim- end

sub print_text {
  my ($textdest, $text, $stripped) = @_;
  
  if (! ($textdest->{level} & MSGLEVEL_NO_URL)
      && (Irssi::level2bits(Irssi::settings_get_str('url_grab_level'))
	  & $textdest->{level}) && ($stripped =~ /$url_regexp/o)) {
    my $real = real_url($&);
    
    if (! Irssi::settings_get_bool('url_redundant')) {
      foreach (@items) { return if ($_->{real_url} eq $real) }
    }
    
    push @items,
      {
       time => time,
       target => $textdest->{target},
       level => Irssi::bits2level($textdest->{level}),
       pre_url => $`,
       url => $&,
       post_url => $',
       real_url => $real
      };

    if (Irssi::settings_get_bool('url_hilight')) {
      my $url_pos = index $text, $&;
      $textdest->{level} |= MSGLEVEL_NO_URL;
      Irssi::signal_emit('print text', $textdest,
			 substr($text, 0, $url_pos) .
			 Irssi::settings_get_str('url_color') . $& . '' .
			 substr($text, $url_pos + length $&),
			 $stripped);
      Irssi::signal_stop();
    }
    
    Irssi::print('Added item #' . @items . ' to URL list')
	if Irssi::settings_get_bool('url_verbose_grab');
  }
}

sub real_url {
  my $url = shift;

  return $url if ($url =~ /^([a-zA-Z0-9+-.]+:\/\/|mailto:)/);
  return "mailto:$url" if ($url =~ /^[a-zA-Z-_.0-9!%]+@.*/);
  return "$1://$url" if ($url =~ /^(ftp|irc)\./);
  return "http://$url";
}

sub read_template {
  my $format = shift;
  my ($file) = File::Spec->catfile(glob(Irssi::settings_get_str('url_template_path')),
				   "template.$format");
  my $template = {
		  command => undef,
		  item_template => undef,
		  file_template => undef,
		  file_timestamp_format => undef,
		  item_timestamp_format => undef
		 };
  my ($keyword, $value);

  if (open(FILE, $file)) {
    while (<FILE>) {
      if (/^\$(\w+): ?(.*)\n?/) {
	if (! exists $template->{$1}) {
	  Irssi::print("$file:$.: unknown keyword `$1'", MSGLEVEL_CLIENTERROR);
	  return undef;
	}
	if ($keyword) {
	  $template->{$keyword} = $value;
	  $keyword = undef;
	  $value = undef;
	}
	if ($2) {
	  $template->{$1} = $2;
	} else {
	  $keyword = $1;
	}
      } elsif ($keyword) {
	$value .= $_;
      }
    }
    close(FILE);
    return $template;
  } else {
    Irssi::print("Unable to open template $file: $!", MSGLEVEL_CLIENTERROR);
    return undef;
  }
}

sub create_templates {
  my ($path) = glob Irssi::settings_get_str('url_template_path');
  my %template;

  unless (-d $path || mkdir $path, 0777) {
    Irssi::print("Unable to create directory $path: $!", MSGLEVEL_CLIENTERROR);
    return;
  }

  $template{html} = <<'EOF';
$command: netscape %f >/dev/null &
$item_template: <tr><td>%s<td>%t<td>%l<td>%{<a href="%r">%u</a>%}</tr>
$file_template:
<html>
  <head>
    <title>IRC URL list</title>
  </head>
  <body>
    <center>
      <table border="1" cellpadding="5">
        <caption>IRC URL list</caption>
        <tr><th>time<th>target<th>level<th>message</tr>
        %i
      </table>
    </center>
    <hr>
    <center><small>Generated %s by url.pl</small></center>
  </body>
</html>
$item_timestamp_format: %T
$file_timestamp_format: %c
EOF

  $template{text} = <<'EOF';
$command: xterm -e less %f &
$item_template:
 time		%s
 target		%t
 level		%l
 message	%m
 url		%u

$file_template:
 IRC URL list
-------------------------------------------------------------------------------

%i-------------------------------------------------------------------------------
 Generated %s by url.pl
$item_timestamp_format: %T
$file_timestamp_format: %c
EOF
  
  foreach (keys %template) {
    my $file = File::Spec->catfile($path, "template.$_");
    if (open(FILE, ">$file")) {
      print FILE $template{$_};
      close(FILE);
      Irssi::print("Created $file");
    } else {
      Irssi::print("Unable to open $file for writing: $!",
		   MSGLEVEL_CLIENTERROR);
    }
  }
}

sub output {
  my ($file, $template) = @_;

  if (open(FILE, ">$file")) {
    my $data;
    foreach (@items) {
      $data .= expand($template->{item_template},
		      's', strftime($template->{item_timestamp_format},
				    localtime $_->{time}),
		      't', $_->{target},
		      'l', $_->{level},
		      '{', $_->{pre_url},
		      'u', $_->{url},
		      '}', $_->{post_url},
		      'm', $_->{pre_url} . $_->{url} . $_->{post_url},
		      'r', $_->{real_url});
    }
    print FILE expand($template->{file_template},
		      'i', $data,
		      's', strftime($template->{file_timestamp_format},
				    localtime));
    close(FILE);
    return 1;
  } else {
    Irssi::print("Unable to open $file for writing", MSGLEVEL_CLIENTERROR);
    return undef;
  }
}

sub url {
  my ($args, $server, $item) = @_;
  my ($file, $format, $template);

  if ($args) {
    if (lc $args eq '-clear') {
      @items = ();
      Irssi::print('URL list cleared');
      return;
    } elsif (lc $args eq '-create') {
      create_templates();
      return;
    } elsif ($args eq '-') {
      if (@items) {
	system(expand(Irssi::settings_get_str('browse_command'),
		      'u', $items[$#items]->{real_url}));
      } else {
	Irssi::print('URL list is empty');
      }
      return;
    } else {
      $format = $args;
    }
  } else {
    $format = Irssi::settings_get_str('url_default_format');
  }
  
  if (@items) {
    ($file) = glob Irssi::settings_get_str('url_file') . ".$format";
    return unless $template = read_template($format);
    system(expand($template->{command}, 'f', $file))
      if (output($file, $template));
  } else {
    Irssi::print('URL list is empty');
  }
}

Irssi::settings_add_str('misc', 'url_grab_level',
			'PUBLIC TOPICS ACTIONS DCCMSGS');
Irssi::settings_add_bool('lookandfeel', 'url_verbose_grab', 1);
Irssi::settings_add_bool('lookandfeel', 'url_hilight', 1);
Irssi::settings_add_str('lookandfeel', 'url_color', '12');
Irssi::settings_add_bool('misc', 'url_redundant', 0);
Irssi::settings_add_str('misc', 'browse_command',
			'galeon-wrapper %u >/dev/null &');
Irssi::settings_add_str('misc', 'url_template_path', '~/.irssi/url');
Irssi::settings_add_str('misc', 'url_file', '~/.irc_url_list');
Irssi::settings_add_str('misc', 'url_default_format', 'html');

Irssi::signal_add('print text', 'print_text');

Irssi::command_bind('url', 'url');
