Source code for beansoup.importers.csv

"""Utilities to implement CSV importers."""

import collections
import csv
import datetime
import io
import itertools
import logging
from os import path
import re

from beancount.core import account_types as atypes
from beancount.core import amount
from beancount.core import data
from beancount.ingest import importer

from beansoup.utils import periods


# A row of the parsed CSV file.
#
# Attributes:
#   lineno: An int identifying the line of the CSV file where this row is found.
#   date: A datetime.date object; the date of the transaction.
#   description: A string; a description of the transaction.
#   amount: A beancount.core.number.Decimal object; the value of the transaction.
#     The sign of its value should be the same as normally used by beancount entries.
#   balance: A beancount.core.number.Decimal object; the balance of the account
#     immediately following the transaction. The sign of its value should be the
#     same as normally used by beancount entries.
Row = collections.namedtuple('Row', 'lineno date description amount balance')


[docs]class Importer(importer.ImporterProtocol):
    """An importer base class for CSV bank and credit card statements.

    Unfortunately, CSV files often do not contain any information to easily
    identify the account; for this reason, this importer relies on the name
    of the file to associate it to a particular account.

    Derived classes need to implement the 'parse' method.

    See beansoup.importers.td.Importer for a full example of how to derive a
    concrete importer from this class.
    """
    def __init__(self, account, currency='CAD', basename=None,
                 first_day=None, filename_regexp=None, account_types=None):
        """Create a new importer for the given account.

        Args:
          account: An account string, the account to associate to the files.
          account_types: An AccountTypes object or None to use the default ones.
          basename: An optional string, the name of the new files.
          currency: A string, the currency for all extracted transactions.
          filename_regexp: A regular expression string used to match the
            basename (no path) of the target file.
          first_day: An int in [1,28]; the first day of the statement/billing
            period or None. If None, the file date will be the date of the
            last extracted entry; otherwise, it will be the date of the end
            of the monthly period containing the last extracted entry.
            Also, if a balance directive can be generated, if None, the balance
            directive will be set to the day following the date of the last
            extracted entry; otherwise, it will be set to the day following the
            end of the statement period.
        """
        self.filename_re = re.compile(filename_regexp or self.filename_regexp)
        self.account = account
        self.currency = currency.upper()
        self.basename = basename
        self.first_day = first_day
        self.account_sign = atypes.get_account_sign(account, account_types)

[docs]    def name(self):
        """Include the account in the name."""
        return '{}: "{}"'.format(super().name(), self.file_account(None))

[docs]    def identify(self, file):
        """Identify whether the file can be processed by this importer."""
        # Match for a compatible MIME type.
        if file.mimetype() != 'text/csv':
            return False

        # Match the file name.
        return self.filename_re.match(path.basename(file.name))

[docs]    def file_account(self, _):
        """Return the account associated with the file"""
        return self.account

[docs]    def file_name(self, file):
        """Return the optional renamed account file name."""
        if self.basename:
            return self.basename + path.splitext(file.name)[1]

[docs]    def file_date(self, file):
        """Return the filing date for the file."""
        rows = self.parse(file)
        date = max(row.date for row in rows)
        if self.first_day is not None:
            date = periods.lowest_end(date, first_day=self.first_day)
        return date

[docs]    def extract(self, file):
        """Return extracted entries and errors."""
        rows = self.parse(file)
        rows, error_lineno = sort_rows(rows)
        new_entries = []
        if len(rows) == 0:
            return new_entries

        for index, row in enumerate(rows):
            posting = data.Posting(
                self.account,
                amount.Amount(row.amount, self.currency),
                None, None, None, None)
            # Use the final positional index rather than the lineno of the row because
            # bean-extract will sort the entries returned by its importers; doing that
            # using the original line number of the parsed CSV row would undo all the
            # effort we did to find their correct chronological order.
            meta = data.new_metadata(file.name, index)
            payee = None
            narration = row.description
            entry = data.Transaction(
                meta,
                row.date,
                self.FLAG,
                payee,
                narration,
                None,
                None,
                [posting])
            new_entries.append(entry)

        # Extract balance, but only if we can trust it
        if error_lineno is not None:
            logging.warning('{}:{}: cannot reorder rows to agree with balance values'.format(
                file.name, error_lineno))
        elif self.first_day is None:
            # Create one single balance entry on the day following the last transaction
            last_row = rows[-1]
            date = last_row.date + datetime.timedelta(days=1)
            balance_entry = self.create_balance_entry(
                file.name, date, last_row.balance)
            new_entries.append(balance_entry)
        else:
            # Create monthly balance entries starting from the most recent one
            balance_date = periods.next(periods.greatest_start(rows[-1].date,
                                                               first_day=self.first_day))
            for row in reversed(rows):
                if row.date < balance_date:
                    new_entries.append(self.create_balance_entry(
                        file.name, balance_date, row.balance))
                    balance_date = periods.prev(balance_date)

        return new_entries

[docs]    def create_balance_entry(self, filename, date, balance):
        # Balance directives will be sorted in front of transactions, so there is no need
        # to have a line number to break ties.
        meta = data.new_metadata(filename, 0)
        balance_entry = data.Balance(meta, date, self.account,
                                     amount.Amount(balance, self.currency),
                                     None, None)
        return balance_entry

[docs]    def parse(self, file):
        """Parse the CSV file.

        Derived classes must implement this method to parse their CSV files.

        Consider using the helper function 'beansoup.importers.csv.parse' to implement
        your custom CSV parser.

        Args:
          file: A cache.FileMemo object.
        Returns:
          A list of Row objects; one object per row.
          The order of the parsed rows is irrelevant; they will be sorted in ascending
          chronological order in a way that agrees with the balance values associated to
          each row. It that is not possible, the balance values will be ignored and the
          importer will be unable to extract balance directive, but will otherwise work
          as expected.
        """
        raise NotImplementedError('Derived classes must implement this method.')


[docs]def parse(file, dialect, parse_row):
    """Parse a CSV file.

    This utility function makes it easy to parse a CSV file format for
    bank or credit card accounts.

    It takes advantage of the ability to cache the file contents, but it
    does not attempt to cache the parsed result. Be careful when you consider
    caching the result of your parser in a cache.FileMemo object; often your
    row parser will adjust the sign of the row balance according to the sign
    of the account associated with the importer using the parser; this means
    that CSV importers for accounts of opposite signs should not share the
    parsed results!

    Args:
      file: A cache.FileMemo object; the CSV file to be parsed.
      dialect: The name of a registered CSV dialect to use for parsing.
      parse_row: A function taking a row (a list of values) and its line number in
        the input file and returning a Row object.
    Returns:
      A list of Row objects in the same order as encountered in the CSV file.
    """
    with io.StringIO(file.contents()) as stream:
        reader = csv.reader(stream, dialect)
        try:
            rows = [parse_row(row, reader.line_num) for row in reader if row]
        except (csv.Error, ValueError) as exc:
            logging.error('{}:{}: {}'.format(file.name, reader.line_num, exc))
            rows = []
    return rows


[docs]def sort_rows(rows):
    """Sort the rows of a CSV file.

    This function can sort the rows of a CSV file in ascending chronological order
    such that the balance values of each row match the sequence of transactions.

    Args:
      rows: A list of objects with a lineno, date, amount, and balance attributes.
    Returns
      A pair with a sorted list of rows and an error. The error is None if the function
      could find an ordering agreeing with the balance values of its rows; otherwise,
      it is the line number in the CSV file corresponding to the first row not agreeing
      with its balance value.
    """
    if len(rows) <= 1:
        return rows, None

    # If there is more than one row sharing the earliest date of the statement, we do not
    # know for sure which one came first, so we have a number of opening balances and we
    # have to find out which one is the right one.
    first_date = rows[0].date
    opening_balances = [row.balance - row.amount for row in itertools.takewhile(
        lambda r: r.date == first_date, rows)]

    error_lineno = 0
    for opening_balance in opening_balances:
        # Given one choice of opening balance, we try to find an ordering of the rows
        # that agrees with the balance amount they show
        stack = list(reversed(rows))
        prev_balance = opening_balance
        unbalanced_rows = []
        balanced_rows = []
        while stack:
            row = stack.pop()
            # Check if the current row balances with the previous one
            if prev_balance + row.amount == row.balance:
                # The current row is in the correct chronological order
                balanced_rows.append(row)
                prev_balance = row.balance
                if unbalanced_rows:
                    # Put unbalanced rows back on the stack so they get another chance
                    stack.extend(unbalanced_rows)
                    unbalanced_rows.clear()
            else:
                # The current row is out of chronological order
                if unbalanced_rows and unbalanced_rows[0].date != row.date:
                    # No ordering can be found that agrees with the
                    # balance values of the rows
                    break
                # Skip the current row for the time being
                unbalanced_rows.append(row)
        if len(balanced_rows) == len(rows):
            return balanced_rows, None
        error_lineno = unbalanced_rows[0].lineno

    # The rows could not be ordered in any way that would agree with the balance values
    return rows, error_lineno