// Manitou-Mail logo title
//
// Source file: src/text_merger.cpp

/* Copyright (C) 2004-2011 Daniel Verite

   This file is part of Manitou-Mail (see http://www.manitou-mail.org)

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/

#include <QString>
#include <QApplication>
#include <QStringList>
#include <QTextCodec>
#include <QFile>

#include "text_merger.h"

//#define DEBUG

/* Usage:
  QFile f("data_file.txt");
  QString tmpl = "Template with {{field1}}, {{field2}}, ...";
  if (f.open(QIODevice::Text| QIODevice::ReadOnly)) {
    try {
      text_merger tm;
      tm.init(&f);
      while (!f.atEnd()) {
        QString res=tm.merge_record(&f, tmpl);
        // do something with res
      }
    }
    catch(QString error) {
      // ...
    }
  }
*/

/* Build a merger that decodes its input with the codec of the current
   locale and starts with the default field separator.
   Throws a QString when the locale provides no usable codec. */
text_merger::text_merger()
{
  // Give the line counter a defined value, so that nb_lines() can be
  // called before init() without reading an uninitialized member.
  m_line = 0;
  // TODO: optionally use a different, per-file codec
  m_codec = QTextCodec::codecForLocale();
  if (!m_codec) {
    throw QObject::tr("No usable codec for the current locale");
  }
  m_separator = m_default_separator;
}

// Separator used when none has been detected or supplied: the constructor
// starts with it and guess_separator() falls back to it.
const QChar
text_merger::m_default_separator=',';

// Pattern of a placeholder in a template: a variable name enclosed in
// double curly braces, as in {{name}}. The functions of this file that use
// it switch the regexp to minimal (non-greedy) matching.
// The variable name will be captured in the parenthesized subexpression
const QString
text_merger::m_placeholder_regexp="\\{\\{(.+)\\}\\}";

// Return the current value of the line counter: init() sets it to 1 and
// collect_data() increments it for every line it reads.
int
text_merger::nb_lines() const
{
  const int line_count = m_line;
  return line_count;
}

// Return the number of registered column names.
int
text_merger::nb_columns() const
{
  const int column_count = m_field_names.size();
  return column_count;
}


// Return the column names ordered by column index (0, 1, 2, ...).
// An index that no name maps to yields a null string at that position.
QStringList
text_merger::column_names() const
{
  QStringList ordered;
  const int total = m_field_names.size();
  int index = 0;
  while (index < total) {
    ordered << m_field_names.key(index);
    ++index;
  }
  return ordered;
}

/* Build one CSV line from 'list': every field is enclosed in double quotes,
   double quotes inside a field are doubled, and the fields are joined
   with 'separator'. An empty list gives an empty result. */
QString
text_merger::csv_join(const QStringList list, QChar separator)
{
  QString line;
  for (int i=0; i<list.size(); ++i) {
    // The separator goes between fields, never before the first one
    if (i>0)
      line.append(separator);
    QString quoted = list.at(i);
    quoted.replace('"', "\"\""); // double the double quotes
    line.append(QString("\"%1\"").arg(quoted));
  }
  return line;
}


// Return the name of the column registered at position 'index',
// or a null string when no column has that position.
QString
text_merger::column_name(int index) const
{
  QMap<QString,int>::const_iterator cur = m_field_names.constBegin();
  const QMap<QString,int>::const_iterator last = m_field_names.constEnd();
  while (cur != last) {
    if (cur.value() == index)
      return cur.key();
    ++cur;
  }
  return QString::null;
}

/* Read the header line from 'io' and register its column names, each one
   mapped to its position in the header. The line counter is reset to 1.
   Can throw a QString (raised by parse_header() on a malformed header). */
void
text_merger::init(QIODevice* io)
{
  const QStringList columns = parse_header(io);

  // Forget the columns of any previous init(), so that a reused merger
  // does not keep stale names alongside the new ones.
  m_field_names.clear();
  for (int i=0; i<columns.size(); i++)
    m_field_names.insert(columns.at(i), i);

  m_line=1;
}

/* Register 'columns' as the column names, each one mapped to its position
   in the list, and use 'separator' as the field separator.
   The line counter is reset to 1. */
void
text_merger::init(const QStringList columns, QChar separator)
{
  m_line=1;
  // Forget the columns of any previous init(), so that a reused merger
  // does not keep stale names alongside the new ones.
  m_field_names.clear();
  for (int i=0; i<columns.size(); i++)
    m_field_names.insert(columns.at(i), i);
  m_separator = separator;
}

/* Read the next record from 'io' and merge its values into 'tmpl'.
   Returns a null string when nothing could be read or when the record is
   an empty line.
   Throws a QString when no column is registered, or when the record does
   not hold exactly one value per column (collect_data() may throw too). */
QString
text_merger::merge_record(QIODevice* io, const QString tmpl)
{
  if (m_field_names.isEmpty())
    throw QObject::tr("No merge fields in data file.");

  const QStringList values = collect_data(io);
  const int nb_values = values.size();

  // Nothing was read
  if (nb_values==0)
    return QString::null;

  // discard empty lines (LF or CRLF alone). This is mostly useful if encountered at the end
  if (nb_values==1 && values.first().isEmpty())
    return QString::null;

  if (nb_values != m_field_names.size()) {
    throw QObject::tr("%1 field(s) at line %2 when %3 were expected").arg(nb_values).arg(m_line).arg(m_field_names.size());
  }
  return merge_template(tmpl, values);
}

/* Return a copy of 'tmpl' in which every {{name}} placeholder naming a
   registered column is replaced by the entry of 'values' at that column's
   index. Placeholders with an unknown name are left untouched. */
QString
text_merger::merge_template(const QString tmpl, const QStringList values)
{
  QString merged = tmpl;
  QRegExp placeholder(m_placeholder_regexp);
  placeholder.setMinimal(true); // non-greedy

  int offset = 0;
  for (;;) {
    offset = placeholder.indexIn(merged, offset);
    if (offset < 0)
      break;

    // By default, step over the placeholder and leave it as it is
    int advance = placeholder.matchedLength();

    const QMap<QString,int>::const_iterator found =
      m_field_names.constFind(placeholder.cap(1));
    // A known field whose index is beyond 'values' is actually an ERROR,
    // since the field should have been in 'values'; it is skipped like an
    // unknown name.
    if (found != m_field_names.constEnd() && found.value() < values.size()) {
      const QString& replacement = values.at(found.value());
      merged.replace(offset, advance, replacement);
      // Resume after the inserted value, so that its contents are never
      // scanned for placeholders
      advance = replacement.length();
    }
    offset += advance;
  }
  return merged;
}

// Split 'header' on the current separator and return the pieces.
// This is a plain split: double quotes get no special treatment.
QStringList
text_merger::column_names(const QString header)
{
  return header.split(m_separator);
}

/* Parse the first line of the CSV file. The line should contain the column names.
   Parse rules for the header differ from the data by the fact that:
   - one line only, no newline allowed inside a column name
   - we don't know the field separator before parsing, the parser has to find it
   - no double-quote is permitted inside a column name
*/

// Tell whether 'c' is one of the characters accepted as a field separator:
// comma, semicolon or tab.
bool
text_merger::is_separator(QChar c)
{
  switch (c.unicode()) {
  case ',':
  case ';':
  case '\t':
    return true;
  default:
    return false;
  }
}

/* Read one line from 'io' and parse it as the CSV header.
   Returns the column names in the order in which they appear.
   The first separator character met (see is_separator()) becomes
   m_separator; m_separator is left as a null QChar when the header contains
   no separator at all.
   Throws a QString describing the problem on a syntax error: empty column
   name before a separator, double quote misplaced, or quoted section left
   open at the end of the line. */
QStringList
text_merger::parse_header(QIODevice* io)
{
  int state=1; // parse state
  int trans; // transition to next parse state
  QString field_value; // current field name
  QStringList fields;
  int column=0;
  QChar c;
  QString errmsg;
  bool separator_found=false;

  m_separator = QChar();

  QByteArray bytes = io->readLine();
  // trimmed() removes the end of line as well as the surrounding whitespace
  QString data_line = m_codec->toUnicode(bytes).trimmed();

  for (column=1; column<=data_line.length(); column++) {
    c = data_line.at(column-1);
    trans=0;
    switch(state) {
      // state 1: initial, at the beginning or end of a field
    case 1:
      if ((separator_found && c==m_separator) || is_separator(c)) {
        // empty field
        trans = 7;
        errmsg = QObject::tr("Empty field");
      }
      else if (c=='"')
        trans=2;
      else {
        trans=4;
        field_value.append(c);
      }
      break;

      // state 2: inside a section beginning with a quote
    case 2:
      if (c=='"')
        trans=3; // end of quoted section or quoted quote
      else {
        trans=2;
        field_value.append(c);
      }
      break;

      // state 3: after a quote encountered inside a quoted section
    case 3:
      if (!separator_found && is_separator(c)) {
        // first separator of the line: it becomes the file's separator
        m_separator=c;
        separator_found=true;
        if (field_value.isEmpty()) {
          trans=7;
          errmsg = QObject::tr("Empty field");
        }
        else {
          trans=1;
          fields.append(field_value);
          field_value.truncate(0);
        }
      }
      else if (c==m_separator) {
        if (field_value.isEmpty()) {
          trans=7;
          errmsg = QObject::tr("Empty field");
        }
        else {
          trans=1;
          fields.append(field_value);
          field_value.truncate(0);
        }
      }
      else if (c=='"') {
        trans=7;
        errmsg = QObject::tr("Double-quote not allowed inside a quoted section in CSV header");
      }
      else {
        trans=7;
        errmsg = QObject::tr("A double quote at the end of a field must be followed by another field or an end of line");
      }
      break;

    // state 4: inside a field
    case 4:
      if (c=='"') {
        trans=7;
        errmsg = QObject::tr("A double quote is not allowed in a field name");
      }
      else if (is_separator(c) && !separator_found) {
        // first separator of the line: it becomes the file's separator
        m_separator=c;
        separator_found=true;
        trans=1;
        fields.append(field_value);
        field_value.truncate(0);
      }
      else if (c==m_separator) {
        trans=1;
        fields.append(field_value);
        field_value.truncate(0);
      }
      else {
        trans=4;
        field_value.append(c);
      }
      break;

    case 7:
    default:
      throw QObject::tr("Unhandled parser state %1").arg(state);
    }

    if (trans>0) {
      state=trans;
      // state 7 is the error state: report where the parser stopped
      if (state==7) {
        QString exc_msg = QObject::tr("Syntax error line 1 column %1.").arg(column);
        if (!errmsg.isEmpty())
          exc_msg += "\n" + errmsg;
        throw exc_msg;
      }
    }
  }

  if (state==2) {
    // The line ended inside a quoted section. Report it instead of silently
    // dropping the column, which would make every data line look like it
    // has the wrong number of fields.
    QString exc_msg = QObject::tr("Syntax error line 1 column %1.").arg(data_line.length());
    exc_msg += "\n" + QObject::tr("Missing closing double quote at the end of the CSV header");
    throw exc_msg;
  }

  // The last field has no separator after it: store it now
  if (state==4 || state==3)
    fields.append(field_value);

  return fields;
}

/* Read one CSV record from 'io' and return its field values.
   A record normally is one line, but it extends over several lines when a
   quoted field contains newlines. m_line is incremented for each line read.
   Returns an empty list when 'io' is at its end before anything is read,
   and a list holding one empty string for an empty line.
   Throws a QString on a syntax error, or when the input ends in the middle
   of a quoted field or right after a carriage return. */
QStringList
text_merger::collect_data(QIODevice* io)
{
  int state=1; // parse state
  int trans; // transition to next parse state
  QString field_value;
  QStringList fields;
  int column=0;
  bool parse_end=false;
  QChar c;
  bool line_complete=false;
  QString errmsg;

  do {
    if (io->atEnd()) { // OK if the parser's state is final (1, 3 or 4)
      if (state==4 || state==3) {
        fields.append(field_value);
      }
      else if (state==1) {
        // State 1 with fields already collected means that the input ended
        // right after a separator: the record has a last, empty field, just
        // as if the separator had been followed by a newline.
        if (!fields.isEmpty())
          fields.append("");
      }
      else {
        throw QObject::tr("Unexpected end of file.");
      }
      parse_end=true;
    }
    else {
      QByteArray bytes = io->readLine();
      m_line++;
      QString data_line = m_codec->toUnicode(bytes);
      for (column=1; column<=data_line.length(); column++) {
        c = data_line.at(column-1);
        trans=0;
        switch(state) {
          // state 1: initial, at the beginning or end of a field
        case 1:
          if (c=='\r')
            trans=10;
          else if (c=='\n') { // empty line?
            line_complete=true;
            trans=1;
          }
          else if (c==m_separator) {
            // empty field
            fields.append("");
            trans=1;
          }
          else if (c=='"')
            trans=2;
          else {
            trans=4;
            field_value.append(c);
          }
          break;

          // state 2: inside a section beginning with a quote
        case 2:
          if (c=='"')
            trans=3; // end of quoted section or quoted quote
          else {
            trans=2;
            field_value.append(c);
          }
          break;

          // state 3: after a quote encountered inside a quoted section
        case 3:
          if (c==m_separator) {
            trans=1;
            fields.append(field_value);
            field_value.truncate(0);
          }
          else if (c=='"') {
            // doubled quote: a literal quote inside the quoted section
            trans=2;
            field_value.append(c);
          }
          else if (c=='\r')
            trans=10;
          else if (c=='\n') {
            trans=1;
            line_complete=true;
          }
          else {
            trans=7;
            errmsg = QObject::tr("A double quote at the end of a field must be followed by another field or an end of line");
          }
          break;
          // state 4: inside a field
        case 4:
          if (c=='"') {
            trans=7;
            errmsg = QObject::tr("A double quote is allowed in a field only if that field is enclosed in double quotes");
          }
          else if (c=='\r')
            trans=10;
          else if (c=='\n') {
            trans=1;
            line_complete=true;
          }
          else if (c==m_separator) {
            trans=1;
            fields.append(field_value);
            field_value.truncate(0);
          }
          else {
            trans=4;
            field_value.append(c);
          }
          break;

          // state 10: Go there on finding CR, to handle CR,LF. Not used if the underlying IO has already
          // converted CR,LF to LF
        case 10:
          if (c=='\n') {
            trans=1;
            line_complete=true;
          }
          else {
            trans=7;
            errmsg = QObject::tr("Carriage return followed by unexpected character (a newline was expected)");
          }
          break;

        case 7:
        default:
          throw QObject::tr("Unhandled parser state %1").arg(state);
        }

        if (trans>0) {
          state=trans;
          // state 7 is the error state: report where the parser stopped
          if (state==7) {
            QString exc_msg = QObject::tr("Syntax error line %1 column %2.").arg(m_line).arg(column);
            if (!errmsg.isEmpty())
              exc_msg += "\n" + errmsg;
            throw exc_msg;
          }
        }
        if (line_complete) {
          // End of the record: store the field in progress (empty when the
          // line is empty or ends with a separator) and stop reading.
          line_complete=false;
          fields.append(field_value);
          parse_end=true;
        }
      }
    }
  } while (!parse_end);

  return fields;
}

/* Return the first separator character (see is_separator()) found in
   'header' after its first character, or the default separator when there
   is none. */
QChar
text_merger::guess_separator(const QString header)
{
  // A separator in the very first position is never accepted, so the scan
  // starts at the second character.
  int pos = 1;
  while (pos < header.length()) {
    const QChar candidate = header.at(pos);
    if (is_separator(candidate))
      return candidate;
    ++pos;
  }
  // if no separator was found, maybe there's only one column.
  return m_default_separator;
}

//static
void
text_merger::extract_variables(const QString tmpl, QSet<QString>& vars)
{
  QRegExp rx(m_placeholder_regexp);
  rx.setMinimal(true); // non-greedy
  /* Search for every occurrence of {something} in the template. */
  int pos=0;
  while ((pos=rx.indexIn(tmpl, pos)) >= 0) {
    vars.insert(rx.cap(1));
    pos += rx.matchedLength();
  }
}


// HTML source code generated by GNU Source-Highlight plus some custom post-processing
//
// List of all available source files