// Source file: src/words.cpp
/* Copyright (C) 2004-2012 Daniel Verite
This file is part of Manitou-Mail (see http://www.manitou-mail.org)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2 as
published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
#include "main.h"
#include "words.h"
#include "db.h"
#include "sqlstream.h"
#ifdef Q_OS_WIN
#include <winsock2.h>
#endif
#if defined(Q_OS_UNIX) || defined(Q_OS_MAC)
#include <netinet/in.h>
#endif
//static
/* Spacing, in mail_id units, between the first mail_id of one partition
   and the first mail_id of the next: get_result_bits() computes the base
   of a partition as part_no*m_partsize. */
int wordsearch_resultset::m_partsize=16384;
/* Build a word whose database id is not known yet: m_word_id starts at 0,
   the value fetch_vectors() tests before calling fetch_id(). */
db_word::db_word()
  : m_word_id(0)
{
}
/* Free every bit_vector referenced by m_vectors. */
db_word::~db_word()
{
  typedef std::map<uint,bit_vector*>::iterator vec_iterator;
  const vec_iterator last = m_vectors.end();
  for (vec_iterator cur = m_vectors.begin(); cur != last; ++cur)
    delete cur->second;
}
//static
bool
db_word::is_non_indexable(const QString w)
{
// DBG_PRINTF(4, "is_non_indexable(%s)\n", w.latin1());
if (w.length()<3)
return true;
db_cnx db;
bool result;
try {
sql_stream s("SELECT 1 FROM non_indexable_words WHERE wordtext=:p1", db);
s << w;
result=!s.eos();
}
catch(db_excpt& p) {
DBEXCPT(p);
result=false;
}
return result;
}
/* Replace the text of the word. Only m_text is assigned; m_word_id and
   m_vectors are left as they are. */
void
db_word::set_text(const QString& txt)
{
  m_text = txt;
}
/* Look up m_text in the words table and store the matching word_id into
   m_word_id, or 0 when no row matches.
   Returns true when the query completed (whether or not the word was
   found), false when it raised db_excpt (handed to DBEXCPT). */
bool
db_word::fetch_id()
{
  db_cnx db;
  try {
    sql_stream s("SELECT word_id FROM words WHERE wordtext=:p1", db);
    s << m_text;
    if (s.eos())
      m_word_id = 0;
    else
      s >> m_word_id;
  }
  catch(db_excpt& p) {
    DBEXCPT(p);
    return false;
  }
  return true;
}
/* Return the vector of bits related to the 'part_no' partition, or NULL
   when m_vectors holds no entry for that partition. The returned vector
   stays owned by the db_word (it is deleted by the destructor). */
const bit_vector*
db_word::vector_part(uint part_no) const
{
  const std::map<uint,bit_vector*>::const_iterator pos = m_vectors.find(part_no);
  if (pos == m_vectors.end())
    return NULL;
  return pos->second;
}
// Fetch the inverted index entries related to the word
bool
db_word::fetch_vectors()
{
db_cnx db;
PGconn* pgconn = db.connection();
if (!m_word_id && !fetch_id())
return false;
QString query= QString("SELECT part_no,mailvec,nz_offset FROM inverted_word_index WHERE word_id=%1").arg(m_word_id);
QByteArray qb_query = query.toUtf8();
PGresult* res = PQexecParams(pgconn,
qb_query.constData(),
0, // number of params
NULL, // param types,
NULL, // param values,
NULL, // param lengths
NULL, // param formats
1 // result format=binary
);
if (res && PQresultStatus(res)==PGRES_TUPLES_OK) {
for (int row=0; row<PQntuples(res); row++) {
//int f=PQfformat(res,1);
//Oid o=PQftype(res, 1);
unsigned long partno = ntohl(*(unsigned long*)PQgetvalue(res, row, 0));
uint part=(uint)partno;
unsigned long nzo = ntohl(*(unsigned long*)PQgetvalue(res, row, 2));
uint nz_offset=(uint)nzo;
// DBG_PRINTF(5, "word='%s' word_id=%d, partno=%d, nz_offset=%d, mailvec.length=%d\n",
// m_text.latin1(), m_word_id, part, nz_offset, PQgetlength(res, row, 1));
bit_vector* v = new bit_vector();
v->set_buf((const uchar*)PQgetvalue(res, row, 1),
PQgetlength(res, row, 1),
nz_offset);
/* (word_id,partno) tuples should be unique. TODO: check
the existence of a m_vectors[part] entry before assigning
it as a consistency test */
m_vectors[part] = v;
}
}
else if (res) {
DBEXCPT(pgconn);
return false;
}
if (res)
PQclear(res);
return true;
}
//static
/* Return 's' stripped of its diacritic marks: the string is put into
   NFKD normalization form, then every character whose category is
   Mark_NonSpacing or Mark_SpacingCombining is dropped. */
QString
db_word::unaccent(const QString s)
{
  const QString decomposed = s.normalized(QString::NormalizationForm_KD);
  const int len = decomposed.length();
  QString out;
  for (int pos = 0; pos < len; ++pos) {
    const QChar ch = decomposed.at(pos);
    const QChar::Category cat = ch.category();
    if (cat == QChar::Mark_NonSpacing || cat == QChar::Mark_SpacingCombining)
      continue; // diacritic mark: skip it
    out.append(ch);
  }
  return out;
}
//static
/* In-place variant: replace each string of the list by its unaccented
   form, as computed by unaccent(const QString). */
void
db_word::unaccent(QStringList& s)
{
  QStringList::iterator cur = s.begin();
  while (cur != s.end()) {
    const QString stripped = unaccent(*cur);
    *cur = stripped;
    ++cur;
  }
}
// static
/* Format 'words' as an SQL array expression: ARRAY['w1','w2',...], each
   word being passed through db.escape_string_literal() and enclosed in
   single quotes.
   An empty list yields '{}'::text[] instead. The cast avoids the
   postgres resolve error "cannot determine type of empty array", and the
   '{}' literal is used rather than array[] because, per the original
   author's note, the latter form is accepted only by PG>=8.4. */
QString
db_word::format_db_string_array(const QStringList& words, db_cnx& db)
{
  if (words.isEmpty())
    return "'{}'::text[]";

  QString txt = "ARRAY[";
  const int nb_words = words.size();
  for (int idx = 0; idx < nb_words; ++idx) {
    if (idx > 0)
      txt.append(',');
    txt.append("'" + db.escape_string_literal(words.at(idx)) + "'");
  }
  txt.append(']');
  return txt;
}
/* Build a result set; no explicit initialization is performed here. */
wordsearch_resultset::wordsearch_resultset()
{
}
/* Release the bit vectors of the result set by delegating to clear(). */
wordsearch_resultset::~wordsearch_resultset()
{
  clear();
}
void
wordsearch_resultset::and_word(const db_word& dbw)
{
// DBG_PRINTF(7, "and_word('%s')\n", dbw.text().latin1());
std::map<uint,bit_vector*>::iterator it;
for (it = m_vect.begin(); it!=m_vect.end(); ++it) {
const bit_vector* v = dbw.vector_part(it->first);
if (!v) {
it->second->clear();
}
else {
it->second->and_op(*v);
}
}
}
void
wordsearch_resultset::insert_word(const db_word& dbw)
{
// DBG_PRINTF(7, "insert_word('%s')\n", dbw.text().latin1());
// copy the vectors from dbw
const std::map<uint,bit_vector*>* w_vecs = dbw.vectors();
std::map<uint,bit_vector*>::const_iterator it;
for (it=w_vecs->begin(); it!=w_vecs->end(); ++it) {
bit_vector* v = new bit_vector();
v->set_buf(it->second->buf(), it->second->size());
m_vect[it->first] = v;
}
}
void
wordsearch_resultset::clear()
{
std::map<uint,bit_vector*>::iterator it;
for (it = m_vect.begin(); it!=m_vect.end(); ++it) {
delete it->second;
}
m_vect.clear();
}
/*
get a list of mail_id from the word vectors
limit is a mail_id
if direction==-1, include only messages for which mail_id < limit
if direction==1, include only messages for which mail_id > limit
if direction==0, limit is ignored
*/
void
wordsearch_resultset::get_result_bits(std::list<mail_id_t>& l,
mail_id_t limit,
int direction,
uint max_results)
{
DBG_PRINTF(7, "get_result_bits(limit=%d,direction=%d,max_results=%u)", limit, direction, max_results);
std::map<uint,bit_vector*>::iterator it;
uint cnt_results=0;
/* if (max_results==0)
return;*/
for (it = m_vect.begin(); it!=m_vect.end(); ++it) {
int part_offset = it->first*m_partsize;
uint sz=it->second->size();
const uchar* buf = it->second->buf();
for (uint o=0; o<sz; o++) {
uchar mask=1;
uchar c=buf[o];
if (!c) continue; // shortcut
for (uint i=0; i<8; i++) {
if (c&mask) {
mail_id_t id = part_offset+(o*8)+i+1;
if (direction==0) {
l.push_back(id);
cnt_results++;
}
else if (direction==-1) {
if (id<limit) {
l.push_back(id);
cnt_results++;
}
}
else if (direction==1) {
if (id>limit) {
l.push_back(id);
cnt_results++;
}
}
if (max_results>0 && cnt_results >= max_results)
return; // shortcut to exit
}
mask = mask << 1;
}
}
}
}
//static
/* NOTE(review): the original comment marks this function as static, yet
   it writes to m_parts -- confirm against the class declaration.

   Fill m_parts with the partition numbers returned by the
   wordsearch_get_parts() SQL function for 'words', in descending order
   of the first result column. m_parts is emptied once the query has
   been issued, before the rows are read.
   Returns true on success, false if db_excpt was raised (the exception
   is handed to DBEXCPT). */
bool
progressive_wordsearch::get_index_parts(const QStringList& words)
{
  db_cnx db;
  try {
    QString query = QString("SELECT * FROM wordsearch_get_parts(%1) ORDER BY 1 desc")
      .arg(db_word::format_db_string_array(words, db));
    sql_stream s(query, db);
    m_parts.clear();
    for (;;) {
      if (s.eos())
	break;
      int part_no;
      s >> part_no;
      m_parts.append(part_no);
    }
  }
  catch(db_excpt& p) {
    DBEXCPT(p);
    return false;
  }
  return true;
}
// HTML source code generated by GNU Source-Highlight plus some custom post-processing
// List of all available source files