Source file: mdx/lib/Manitou/Attachments.pm
# Copyright (C) 2004-2012 Daniel Verite
# This file is part of Manitou-Mail (see http://www.manitou-mail.org)
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
package Manitou::Attachments;
use strict;
use vars qw(@ISA @EXPORT_OK);
use File::stat;
use Carp;
use POSIX qw(tmpnam);
use Encode;
use Manitou::Encoding qw(encode_dbtxt header_decode);
use Manitou::Log qw(error_log warning_log);
use Manitou::Config qw(getconf);
use IPC::Open3;
use IO::Handle;
use Digest::SHA;
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(flatten_and_insert_attach detach_text_attachments
attach_parts create_html_part has_attachments);
sub get_sequence_nextval {
my ($dbh, $seq) = @_;
my ($nextval, $sth, $row);
$sth=$dbh->prepare("SELECT nextval('".$seq."')") or die $dbh->errstr;
$sth->execute() or die $dbh->errstr;
my @row = $sth->fetchrow_array;
if ($row[0]) {
$nextval = $row[0];
} else {
$nextval = 1;
}
$sth->finish;
return $nextval;
}
sub detach_text_attachments {
my ($dbh, $top, $mail_id, $pbody_text, $pbody_html) = @_;
my $attachments;
if ($top->is_multipart) {
# Identify text/plain + text/html combination to insert
# into body (bodytext,bodyhtml) instead of putting the HTML part
# into the attachment table.
if ($top->effective_type eq "multipart/alternative" &&
$top->parts==2 &&
$top->parts(0)->bodyhandle &&
$top->parts(1)->bodyhandle &&
( ($top->parts(0)->effective_type eq "text/plain" &&
$top->parts(1)->effective_type eq "text/html") ||
($top->parts(0)->effective_type eq "text/html" &&
$top->parts(1)->effective_type eq "text/plain")
))
{
for my $i (0,1) {
my $p = $top->parts($i);
my $charset = $p->head->mime_attr("content-type.charset") || 'iso-8859-1';
$charset ='iso-8859-1' if (!Encode::resolve_alias($charset));
if ($p->effective_type eq "text/plain") {
$$pbody_text .= Encode::decode($charset, $p->bodyhandle->as_string);
}
else {
# For HTML, we don't concatenate parts. If contents have already been put
# in the HTML part upper in the recursion steps, then the new contents
# are inserted into the attachment table.
if (!defined $$pbody_html) {
$$pbody_html = Encode::decode($charset, $p->bodyhandle->as_string);
}
else {
flatten_and_insert_attach($dbh, $p, $mail_id);
}
}
}
}
else {
# Generic case. Concat text/plain parts with no filename to the text body
# and insert other parts into the attachment table.
foreach my $p ($top->parts) {
if ($p->effective_type eq "text/plain" &&
$p->head->recommended_filename eq "" &&
$p->bodyhandle) {
my $charset = $p->head->mime_attr("content-type.charset") || 'iso-8859-1';
$charset='iso-8859-1' if (!Encode::resolve_alias($charset));
$$pbody_text .= Encode::decode($charset, $p->bodyhandle->as_string);
}
else {
$attachments += detach_text_attachments($dbh, $p, $mail_id, $pbody_text, $pbody_html);
}
}
}
}
else {
$attachments = flatten_and_insert_attach($dbh, $top, $mail_id);
}
return $attachments;
}
sub flatten_and_insert_attach {
my ($dbh, $top,$mail_id) = @_;
my $attachments;
if ($top->is_multipart) {
foreach ($top->parts) {
$attachments+=flatten_and_insert_attach($dbh, $_, $mail_id);
}
}
else {
if ($top->bodyhandle) {
&insert_attachment($dbh, $mail_id, $top);
$attachments++;
}
}
return $attachments;
}
# remove undesirable characters
sub sanitize_filename {
my $f=$_[0];
$f =~ s/\/\\\"\'\<\>//g;
$f =~ s/\s/_/g;
$f =~ tr/\x00-\x1F//d;
return $f;
}
sub insert_attachment {
my ($dbh, $mail_id, $mime_obj) = @_;
my $attachment_id = get_sequence_nextval($dbh, "seq_attachment_id");
my $lobjId;
my $attch_file=tmpnam();
open(PGIN, ">$attch_file") or die "can not open $attch_file: $!";
$mime_obj->bodyhandle->print(\*PGIN);
close(PGIN);
my $filesize = stat($attch_file)->size;
my $stha = $dbh->prepare("INSERT INTO attachments(attachment_id,mail_id,content_type,content_size,filename,charset,mime_content_id) VALUES (?,?,?,?,?,?,?)") or die $dbh->errstr;
my $pos = 0;
$stha->bind_param(++$pos, $attachment_id);
$stha->bind_param(++$pos, $mail_id);
my $a_type=substr(header_decode($mime_obj->effective_type),0,300);
$stha->bind_param(++$pos, encode_dbtxt($a_type));
$stha->bind_param(++$pos, $filesize);
my $fname=header_decode($mime_obj->head->recommended_filename);
$fname=sanitize_filename(substr($fname,0,300));
$stha->bind_param(++$pos, encode_dbtxt($fname));
my $charset=header_decode($mime_obj->head->mime_attr("content-type.charset"));
$stha->bind_param(++$pos, encode_dbtxt(substr($charset,0,30)));
my $content_id=$mime_obj->get("Content-ID");
# content-ID syntax must be <addr-spec> (RFC2111)
$content_id = ($content_id =~ /^\<(.*)\>$/) ? $1 : undef;
$stha->bind_param(++$pos, encode_dbtxt(header_decode($content_id)));
$stha->execute or die $stha->errstr;
$stha->finish;
if ($filesize>0) {
# compute the fingerprint
my $sha1 = Digest::SHA->new("SHA-1");
open(PGIN, "$attch_file") or die "can not open $attch_file: $!";
binmode PGIN;
$sha1->addfile(*PGIN);
my $fingerprint = $sha1->b64digest;
# check if the content already exists in the database
my $sth1=$dbh->prepare("SELECT content FROM attachment_contents WHERE fingerprint=?");
$sth1->execute($fingerprint) or die $sth1->errstr;
($lobjId)=$sth1->fetchrow_array;
$sth1->finish;
if (!$lobjId) {
# import the contents
$lobjId = $dbh->func($attch_file, 'lo_import');
}
my $sth = $dbh->prepare("INSERT INTO attachment_contents(attachment_id, content, fingerprint) VALUES (?,?,?)") or die $dbh->errstr;
$sth->bind_param(1, $attachment_id);
$sth->bind_param(2, $lobjId);
$sth->bind_param(3, $fingerprint);
$sth->execute or die $sth->errstr;
$sth->finish;
}
unlink($attch_file);
}
# Create the file if it is a file or put the attachment contents to
# the location pointed to by $content
sub get_one_attachment {
my ($dbh, $a_id,$filename,$content_size,$content,$tmpdir)=@_;
my $ret;
my $sth = $dbh->prepare ("SELECT content FROM attachment_contents WHERE " .
"attachment_id=?") ||
die "Can't prepare statement: $DBI::errstr";
$sth->execute($a_id) || die "Can't execute statement: $DBI::errstr";
my @row = $sth->fetchrow_array;
die $sth->errstr if $sth->err;
my $lobjId = $row[0];
$sth->finish;
# Note: this needs to run while inside a transaction (this is required
# for the lo_* functions)
if ($filename) {
$ret = $dbh->func ($lobjId, $tmpdir . "/$filename", 'lo_export');
}
else {
$$content = "";
my $lobj_fd = $dbh->func ($lobjId, $dbh->{pg_INV_READ}, 'lo_open');
die $dbh->errstr if (!defined($lobj_fd));
my $buf;
my $nbytes;
while ($content_size > 0) {
$nbytes = $dbh->func($lobj_fd, $buf, 16384, 'lo_read');
die $dbh->errstr if (!defined($nbytes));
$content_size -= $nbytes;
$$content .= $buf;
}
$dbh->func ($lobj_fd, 'lo_close');
}
}
# Returns true if there are files to be attached, not counting the files
# related to the HTML parts
sub has_attachments {
my ($dbh, $mail_id)=@_;
my $sth = $dbh->prepare("SELECT count(*) FROM attachments WHERE mail_id=? AND mime_content_id IS NULL");
$sth->execute($mail_id);
my @r=$sth->fetchrow_array();
$sth->finish;
return ($r[0]>0);
}
# TODO: find a better way to ignore attachments related to the HTML
# part than mime_content_id IS NULL
sub attach_parts {
my ($dbh,$mail_id,$mobj,$tmpdir)=@_;
my $sth = $dbh->prepare ("SELECT attachment_id,content_type,content_size,filename,mime_content_id FROM attachments WHERE mail_id=? AND mime_content_id IS NULL") || die "Can't prepare statement: $DBI::errstr";
$sth->execute($mail_id) || die "Can't execute statement: $DBI::errstr";
while (my $row = $sth->fetchrow_hashref) {
my $attch_data;
$row->{filename} = sanitize_filename($row->{filename}) if defined($row->{filename});
get_one_attachment($dbh, $row->{attachment_id}, $row->{filename},
$row->{content_size}, \$attch_data, $tmpdir);
my %args = ('Encoding' => '-SUGGEST',
'Disposition' => 'attachment',
'Type' => $row->{content_type});
if (defined $row->{filename}) {
$args{'Path'} = $tmpdir . "/" . $row->{filename};
}
else {
$args{'Data'} = $attch_data;
}
# if ($mime_content_id) {
# $args{'Id'} = $mime_content_id;
# }
$mobj->attach(%args);
}
$sth->finish;
}
# Create the MIME part that holds the HTML contents.
# If the HTML body references external objects with CIDs (probably
# images), then we create a multipart/related, otherwise a single part.
# At the moment, an attachment is assumed to be related to the HTML contents
# if and only if it has a mime_content_id. In a future version, the MIME structure
# might be laid out in the database in advance to provide for more flexibility.
sub create_html_part {
my ($dbh, $mail_id, $ref_html_text) = @_;
my $sth = $dbh->prepare ("SELECT attachment_id,content_type,content_size,mime_content_id FROM attachments WHERE mail_id=? AND mime_content_id IS NOT NULL ORDER BY 1") || die "Can't prepare statement: $DBI::errstr";
$sth->execute($mail_id) || die "Can't execute statement: $DBI::errstr";
my $part;
if ($sth->rows > 0) {
$part = MIME::Entity->build('Type' => 'multipart/related');
$part->attach('Data' => $$ref_html_text,
'Encoding' => '-SUGGEST',
'Type' => 'text/html',
'Charset' => 'utf-8');
while (my $row = $sth->fetchrow_hashref) {
my $attch_data;
get_one_attachment($dbh, $row->{attachment_id}, undef,
$row->{content_size}, \$attch_data);
$part->attach('Encoding' => '-SUGGEST',
'Type' => $row->{content_type},
'Data' => $attch_data,
'Id' => "<" . $row->{mime_content_id} . ">"
);
}
}
else {
$part = MIME::Entity->build('Data' => $$ref_html_text,
'Encoding' => '-SUGGEST',
'Type' => 'text/html',
'Charset' => 'utf-8',
'X-Mailer' => undef);
}
$sth->finish;
return $part;
}
# Input: identity
sub text_extractors {
my $word_extractors = getconf('index_words_extractors', $_[0]);
my %extractors;
if (defined $word_extractors && @{$word_extractors}>0) {
foreach (@{$word_extractors}) { # content_type : program
# TODO: keep only the extractors that match content types for which
# we have actual attachments for this message.
# Get the attachments list in this function and pass them
# to attach_parts() and launch_text_extractors() rather than
# letting these functions query the database.
if (/^(.*)\s*:\s*(.*)\s*$/) {
$extractors{$1}=$2;
}
else {
warning_log("Entry ignored in index_words_extractors: $_");
}
}
}
return %extractors;
}
# $commands: hashref {"content_type"=>"command to extract words"}
# Requires the db connection to be inside a transaction because of the
# operations on large objects
#
# Returns: 0 on failure, 1 otherwise.
sub launch_text_extractors {
my ($dbh, $mail_id, $commands, $ref_text)=@_;
my $sth = $dbh->prepare("SELECT a.attachment_id,a.content_type,a.content_size,ac.content FROM attachments a JOIN attachment_contents ac ON a.attachment_id=ac.attachment_id WHERE a.mail_id=? AND a.mime_content_id IS NULL");
my $errmsg;
$sth->execute($mail_id);
while (my $row = $sth->fetchrow_hashref) {
my $ct=$row->{content_type};
if (exists $commands->{$ct}) {
my $cmd=$commands->{$ct};
my $output;
# Pipe the contents to the extractor and get results into $output
my $ret=0;
my $in=IO::Handle->new();
my $out=IO::Handle->new();
my $err=IO::Handle->new();
eval {
$SIG{'PIPE'} = 'IGNORE';
my $pid = open3($in, $out, $err, $cmd) or die $!;
binmode $out, ':utf8';
$out->blocking(0);
my $bits;
vec($bits, fileno($out), 1)=1;
my $content_size = $row->{content_size};
my $lobj_fd = $dbh->func($row->{content}, $dbh->{pg_INV_READ}, 'lo_open');
die $dbh->errstr if (!defined $lobj_fd);
my $buf;
my $nbytes;
while ($content_size>0) {
$nbytes = $dbh->func($lobj_fd, $buf, $content_size>524288 ? 524288:$content_size, 'lo_read');
die $dbh->errstr if (!defined $nbytes);
$content_size -= $nbytes;
# Send to script
print $in $buf;
while (select(undef, $bits, undef, 0.2)) {
# read the output of the extractor during execution
# to avoid too much buffering
$$ref_text.=<$out>;
}
}
$dbh->func($lobj_fd, 'lo_close');
close($in);
$out->blocking(1);
while (<$out>) {
$$ref_text .=$_;
}
waitpid($pid, 0);
};
my $base_msg="Attachments text extractor execution error (\`$cmd\`, exit code=".($?>>8)."), message #$mail_id, attachment #$row->{attachment_id}";
if ($@) {
$errmsg="$base_msg: $@";
}
else {
my $e=<$err>;
if ($e ne "" || ($?>>8)!=0) {
$errmsg= "$base_msg: $e";
}
}
$SIG{'PIPE'}='DEFAULT';
close($err);
close($out);
if ($errmsg) {
error_log($errmsg);
return 0;
}
}
elsif ($ct eq "text/html") {
# Built-in default extractor for HTML attachments
my $lobj_fd = $dbh->func ($row->{content}, $dbh->{pg_INV_READ}, 'lo_open');
die $dbh->errstr if (!defined $lobj_fd);
my $buf;
if ($dbh->func($lobj_fd, $buf, $row->{content_size}, 'lo_read')) {
$$ref_text .= Manitou::Words::html_to_text($buf);
}
$dbh->func ($lobj_fd, 'lo_close');
}
}
1;
}
1;
HTML source code generated by GNU Source-Highlight plus some custom post-processing
List of all available source files