Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Old entry trimmer hook. #127

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions smart_importer/trimmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Old entry trimmer hook.

This hook excludes entries in the import that occurred prior to the last
existing balance entry (that is already in the beancount ledger). Since these
imported transactions should already be accounted for, the output of
bean-extract becomes a lot smaller and easier to read. This is especially useful
for financial institutions that do not allow customizing the date range of
exported transactions.

By default, we only trim old entries that are already marked as duplicates
(e.g., by `DuplicateDetector`, which therefore has to run before this hook). If
you do not use `DuplicateDetector` or want to trim old entries regardless of the
"duplicate" metadata, set `only_trim_duplicates` to `False`.
"""

from typing import List

import logging
import datetime

from beancount.core.data import Directive, Balance, Transaction


from smart_importer.hooks import ImporterHook


class OldEntryTrimmer(ImporterHook):
    """Hook that drops imported transactions dated before the last balance.

    The cutoff is the latest date among all ``Balance`` directives found in
    the existing entries. Non-transaction directives, and transactions dated
    on or after the cutoff, are always kept.

    Args:
        only_trim_duplicates: When true (the default), an old transaction is
            dropped only if its metadata carries a truthy ``__duplicate__``
            flag. When false, every old transaction is dropped.
    """

    def __init__(self, only_trim_duplicates: bool = True):
        self.only_trim_duplicates = only_trim_duplicates

    def __call__(
        self,
        importer,
        file,
        imported_entries: List[Directive],
        existing_entries: List[Directive],
    ) -> List[Directive]:
        """Return the imported entries with old transactions filtered out.

        Args:
            importer: The importer that produced the entries (unused here).
            file: The file being imported (unused here).
            imported_entries: Directives produced by the importer.
            existing_entries: Directives already in the ledger; ``None`` or
                an empty list means there is no cutoff and nothing is trimmed.

        Returns:
            A new list with the entries that survive trimming, in their
            original order.
        """
        # Callers may pass None when no existing ledger is available; treat
        # that like an empty ledger instead of failing on iteration.
        ledger = existing_entries or ()
        last_balance: datetime.date = max(
            (entry.date for entry in ledger if isinstance(entry, Balance)),
            # Without any balance directive every date counts as "new".
            default=datetime.date.min,
        )

        def should_keep(entry: Directive) -> bool:
            # Always keep non-transactions.
            if not isinstance(entry, Transaction):
                return True
            # Always keep "new" transactions (dated on/after the cutoff).
            if entry.date >= last_balance:
                return True
            # Always discard old duplicates. Metadata may be None on
            # programmatically built entries, so guard the lookup.
            metadata = entry.meta or {}
            if metadata.get("__duplicate__", False):
                return False
            # At this point, we have an old non-duplicate transaction: it
            # survives only when trimming is restricted to duplicates.
            return self.only_trim_duplicates

        return [entry for entry in imported_entries if should_keep(entry)]
133 changes: 133 additions & 0 deletions tests/trimmer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Tests for the OldEntryTrimmer"""

from typing import List

import copy
import pytest

from beancount.core.data import Directive, Transaction
from beancount.ingest.importer import ImporterProtocol
from beancount.parser import parser

from smart_importer import apply_hooks
from smart_importer.trimmer import OldEntryTrimmer
from smart_importer.detector import DuplicateDetector


# Ledger fixture standing in for the user's existing beancount file. Its most
# recent balance directive is dated 2016-01-09, which is the trimming cutoff
# the tests below rely on. Posting lines must be indented under their
# transaction for the beancount parser to attach them.
existing_entries, _, _ = parser.parse_string(
    """
2016-01-01 open Assets:US:BofA:Checking USD
2016-01-01 open Equity:Initial-Balance:US:BofA:Checking USD
2016-01-01 open Expenses:Food:Groceries USD
2016-01-01 open Expenses:Food:Restaurant USD
2016-01-01 open Expenses:Scams USD

2016-01-01 * "Initial Balance"
  Assets:US:BofA:Checking 100 USD
  Equity:Initial-Balance:US:BofA:Checking

2016-01-02 balance Assets:US:BofA:Checking 100 USD

2016-01-06 * "Farmer Fresh" "Buying groceries"
  Assets:US:BofA:Checking -2.50 USD
  Expenses:Food:Groceries

2016-01-07 balance Assets:US:BofA:Checking 97.50 USD

2016-01-07 * "Farmer Fresh" "Groceries"
  Assets:US:BofA:Checking -10.20 USD
  Expenses:Food:Groceries

2016-01-08 balance Assets:US:BofA:Checking 87.30 USD

2016-01-08 * "Uncle Boons" "Eating out with Joe"
  Assets:US:BofA:Checking -38.36 USD
  Expenses:Food:Restaurant

2016-01-09 balance Assets:US:BofA:Checking 48.94 USD

2016-01-10 * "Uncle Boons" "Dinner with Mary"
  Assets:US:BofA:Checking -35.00 USD
  Expenses:Food:Restaurant

"""
)

# Importer-output fixture: four transactions, one for each combination of
# old/new (relative to the last balance) and duplicate/non-duplicate. Posting
# lines must be indented under their transaction for the beancount parser to
# attach them.
example_imported_data, _, _ = parser.parse_string(
    """
; 0: old_dup - existing entry before last balance
2016-01-07 * "Farmer Fresh" "Groceries"
  Assets:US:BofA:Checking -10.20 USD
  Expenses:Food:Groceries

; 1: old_non_dup - new entry before last balance
2016-01-08 * "Scammers" "Car warranty extension"
  Assets:US:BofA:Checking -38.36 USD
  Expenses:Scams

; 2: new_dup - existing entry after last balance
2016-01-10 * "Uncle Boons" "Dinner with Mary"
  Assets:US:BofA:Checking -35.00 USD
  Expenses:Food:Restaurant

; 3: new_non_dup - new entry, not seen before
2016-01-11 * "Ye Old Diner" "Lunch with Hasan"
  Assets:US:BofA:Checking -27.30 USD
  Expenses:Food:Restaurant
"""
)


def _marked_duplicate(entry: Transaction):
    """Set the ``__duplicate__`` flag on *entry* in place and return it."""
    metadata = entry.meta
    metadata["__duplicate__"] = True
    return entry


# Expected-result fixtures: independent deep copies of the four imported
# entries, so that flagging a copy never touches the importer's own data.
old_dup, old_non_dup, new_dup, new_non_dup = (
    copy.deepcopy(example_imported_data[position]) for position in range(4)
)
# The two entries that also exist in the ledger carry the duplicate flag.
_marked_duplicate(old_dup)
_marked_duplicate(new_dup)


class FakeImporter(ImporterProtocol):
    """Importer stub that hands back a fixed, pre-built list of directives."""

    def __init__(self, postings: List[Directive]):
        # Canned directives that extract() returns unchanged.
        self.postings = postings

    def extract(self, file, existing_entries=None):
        """Return the canned directives; both arguments are ignored."""
        canned = self.postings
        return canned

    def file_account(self, file):
        """Return the same checking account regardless of *file*."""
        account = "Assets:US:BofA:Checking"
        return account


def test_importer_returns_all_data():
    """With only the duplicate detector hooked in, all four entries remain."""
    hooks = [DuplicateDetector()]
    hooked = apply_hooks(FakeImporter(example_imported_data), hooks)
    extracted = hooked.extract("foo", existing_entries)
    expected = [old_dup, old_non_dup, new_dup, new_non_dup]
    assert extracted == expected


def test_trimmer_removes_old_entries_keeps_nondups():
    """In duplicates-only mode, only the old duplicate entry is dropped."""
    hooks = [DuplicateDetector(), OldEntryTrimmer(only_trim_duplicates=True)]
    hooked = apply_hooks(FakeImporter(example_imported_data), hooks)
    extracted = hooked.extract("foo", existing_entries)
    expected = [old_non_dup, new_dup, new_non_dup]
    assert extracted == expected


def test_trimmer_removes_all_old_entries():
    """With duplicates-only mode off, both old entries are dropped."""
    hooks = [DuplicateDetector(), OldEntryTrimmer(only_trim_duplicates=False)]
    hooked = apply_hooks(FakeImporter(example_imported_data), hooks)
    extracted = hooked.extract("foo", existing_entries)
    expected = [new_dup, new_non_dup]
    assert extracted == expected