Word extractors are external commands that manitou-mdx launches with the contents of an attachment piped to their standard input. They extract the words and write them, encoded in UTF-8, to their standard output. manitou-mdx then associates these words with the message being processed in the inverted word index.
====== Declaration ======
Extractors are declared in the manitou-mdx configuration file with the **index_words_extractors** multi-line entry. Each line associates a MIME type with the extractor that handles it.
Example:
[mailbox@domain.tld]
index_words_extractors = application/pdf: /opt/scripts/pdf2text \
application/msword: /opt/scripts/word2text
The extractors are generally shell scripts wrapping a call to a converter program like [[http://www.winfield.demon.nl/|antiword]] for MS-Word documents, pdftotext from [[http://poppler.freedesktop.org/|poppler]], or [[http://dag.wieers.com/home-made/unoconv/|unoconv]] for OpenOffice documents.
====== Ready-to-use extractors ======
Here is a collection of sample extractors for common file formats:
===== MS-Word [.doc] =====
MIME-type: ''application/msword''
#!/bin/sh
# Word extractor for MS-Word (.doc) attachments.
# Input:  the raw .doc file on stdin.
# Output: the document text on stdout, produced by antiword
#         (antiword package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, or antiword fails.

# Spool stdin into a private scratch directory and hand antiword the file
# path. mktemp replaces the deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs, so the path is
# quoted correctly whatever it contains; the EXIT trap covers every exit
# path below, so no manual cleanup is needed.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.doc
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
antiword -i1 "$infile" || exit 1
exit 0
===== MS-Word Open XML [.docx] =====
MIME-type: ''application/vnd.openxmlformats-officedocument.wordprocessingml.document''
#!/bin/sh
# Word extractor for MS-Word Open XML (.docx) attachments.
# Input:  the raw .docx file on stdin.
# Output: the document text on stdout, converted by unoconv
#         (unoconv package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, the conversion fails, or the converted
# text cannot be read back.
# TODO: handle unoconv deadlock (set a background and timeout)

# One private scratch directory holds both the input and the converted
# output, so a single trap removes everything. mktemp replaces the
# deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs; the EXIT trap
# covers every exit path below, including the successful one.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.docx
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# unoconv treats every positional argument as an input document, so the
# destination is given with -o. Pointing -o at a directory makes unoconv
# write <input basename>.txt there.
unoconv -d document -f txt -o "$workdir" "$infile" || exit 1
cat "$workdir/input.txt" || exit 1
exit 0
===== OpenDocument spreadsheets [.ods] =====
MIME-type: ''application/vnd.oasis.opendocument.spreadsheet''
#!/bin/sh
# Word extractor for OpenDocument spreadsheet (.ods) attachments.
# Input:  the raw .ods file on stdin.
# Output: the spreadsheet content as CSV text on stdout, converted by
#         unoconv (unoconv package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, the conversion fails, or the converted
# text cannot be read back.
# TODO: handle unoconv deadlock (set a background and timeout)

# One private scratch directory holds both the input and the converted
# output, so a single trap removes everything. mktemp replaces the
# deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs; the EXIT trap
# covers every exit path below, including the successful one.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.ods
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# csv is an export filter of the "spreadsheet" document type. unoconv
# treats every positional argument as an input document, so the
# destination is given with -o; pointing -o at a directory makes unoconv
# write <input basename>.csv there.
unoconv -d spreadsheet -f csv -o "$workdir" "$infile" || exit 1
cat "$workdir/input.csv" || exit 1
exit 0
===== OpenDocument texts [.odt] =====
MIME-type: ''application/vnd.oasis.opendocument.text''
#!/bin/sh
# Word extractor for OpenDocument text (.odt) attachments.
# Input:  the raw .odt file on stdin.
# Output: the document text on stdout, converted by unoconv
#         (unoconv package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, the conversion fails, or the converted
# text cannot be read back.
# TODO: handle unoconv deadlock (set a background and timeout)

# One private scratch directory holds both the input and the converted
# output, so a single trap removes everything. mktemp replaces the
# deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs; the EXIT trap
# covers every exit path below, including the successful one.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.odt
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# unoconv treats every positional argument as an input document, so the
# destination is given with -o. Pointing -o at a directory makes unoconv
# write <input basename>.txt there.
unoconv -d document -f txt -o "$workdir" "$infile" || exit 1
cat "$workdir/input.txt" || exit 1
exit 0
===== Portable Document Format [.pdf] =====
MIME-Type: ''application/pdf''
#!/bin/sh
# Word extractor for PDF (.pdf) attachments.
# Input:  the raw .pdf file on stdin.
# Output: the document text on stdout, produced by pdftotext
#         (poppler-utils package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, or pdftotext fails.

# Spool stdin into a private scratch directory and hand pdftotext the file
# path. mktemp replaces the deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs, so the path is
# quoted correctly whatever it contains; the EXIT trap covers every exit
# path below, so no manual cleanup is needed.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.pdf
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# "-" as the output file sends the extracted text to stdout.
pdftotext -q "$infile" - || exit 1
exit 0
===== MS-Excel spreadsheets [.xls] =====
MIME-Type: ''application/vnd.ms-excel''
#!/bin/sh
# Word extractor for MS-Excel spreadsheet (.xls) attachments.
# Input:  the raw .xls file on stdin.
# Output: the spreadsheet content as CSV text on stdout, converted by
#         unoconv (unoconv package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, the conversion fails, or the converted
# text cannot be read back.
# TODO: handle unoconv deadlock (set a background and timeout)

# One private scratch directory holds both the input and the converted
# output, so a single trap removes everything. mktemp replaces the
# deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs; the EXIT trap
# covers every exit path below, including the successful one.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.xls
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# csv is an export filter of the "spreadsheet" document type. unoconv
# treats every positional argument as an input document, so the
# destination is given with -o; pointing -o at a directory makes unoconv
# write <input basename>.csv there.
unoconv -d spreadsheet -f csv -o "$workdir" "$infile" || exit 1
cat "$workdir/input.csv" || exit 1
exit 0
===== Open XML spreadsheets [.xlsx] =====
MIME-Type: ''application/vnd.openxmlformats-officedocument.spreadsheetml.sheet''
#!/bin/sh
# Word extractor for Open XML spreadsheet (.xlsx) attachments.
# Input:  the raw .xlsx file on stdin.
# Output: the spreadsheet content as CSV text on stdout, converted by
#         unoconv (unoconv package).
# Exit status: 0 on success; 1 if the scratch directory cannot be created,
# stdin cannot be spooled to disk, the conversion fails, or the converted
# text cannot be read back.
# TODO: handle unoconv deadlock (set a background and timeout)

# One private scratch directory holds both the input and the converted
# output, so a single trap removes everything. mktemp replaces the
# deprecated tempfile(1).
workdir=$(mktemp -d) || exit 1
# Single quotes defer the expansion until the trap runs; the EXIT trap
# covers every exit path below, including the successful one.
trap 'rm -rf -- "$workdir"' EXIT
# Turn the usual termination signals into a normal exit so the EXIT trap
# also runs in shells that skip it when killed by a signal.
trap 'exit 1' HUP INT TERM
infile=$workdir/input.xlsx
# Stop early if the attachment could not be written completely.
cat >"$infile" || exit 1
# csv is an export filter of the "spreadsheet" document type. unoconv
# treats every positional argument as an input document, so the
# destination is given with -o; pointing -o at a directory makes unoconv
# write <input basename>.csv there.
unoconv -d spreadsheet -f csv -o "$workdir" "$infile" || exit 1
cat "$workdir/input.csv" || exit 1
exit 0