From 1c705bcc3a758754e03a6fe3c8f902ae5da812d1 Mon Sep 17 00:00:00 2001 From: Francisco Silva Date: Thu, 20 Feb 2025 20:03:15 +0000 Subject: [PATCH] Refactor path handling, changing it to a centralized config file. --- .gitignore | 12 ++++----- stocksense/app/pages/insights.py | 10 +++---- stocksense/config/__init__.py | 36 ++++++++++++++++++++++++- stocksense/config/paths.py | 44 +++++++++++++++++++++++++++++++ stocksense/database/connection.py | 4 +-- stocksense/model/__init__.py | 7 ++++- stocksense/model/model_handler.py | 8 ++---- stocksense/model/portfolio.py | 5 ++-- stocksense/pipeline/etl.py | 13 +++------ stocksense/pipeline/preprocess.py | 4 --- 10 files changed, 102 insertions(+), 41 deletions(-) create mode 100644 stocksense/config/paths.py diff --git a/.gitignore b/.gitignore index ae6297e..56d7ee9 100644 --- a/.gitignore +++ b/.gitignore @@ -158,12 +158,12 @@ cython_debug/ .vscode/ # Data -**/data/cache/* -**/data/raw/* -**/data/interim/* -**/data/processed/* -**/data/database/* -**/data/tests/* +data/cache/* +data/raw/* +data/interim/* +data/processed/* +data/database/* +data/tests/* old/ log/ diff --git a/stocksense/app/pages/insights.py b/stocksense/app/pages/insights.py index 829d83e..cb2bb2f 100644 --- a/stocksense/app/pages/insights.py +++ b/stocksense/app/pages/insights.py @@ -1,16 +1,12 @@ import datetime as dt -from pathlib import Path import pandas as pd import plotly.express as px import streamlit as st +from stocksense.config import PORTFOLIO_DIR from stocksense.database import DatabaseHandler -REPORTS_DIR = Path(__file__).parents[3] / "reports" -SCORES_DIR = REPORTS_DIR / "scores" -PORTFOLIOS_DIR = REPORTS_DIR / "portfolios" - @st.cache_data(show_spinner="Loading stock data...", max_entries=10) def load_stock_data(): @@ -22,7 +18,7 @@ def get_available_portfolios(): """ Get all available portfolio files. """ - portfolio_files = list(PORTFOLIOS_DIR.glob("portfolio_*.xlsx")) + portfolio_files = list(PORTFOLIO_DIR.glob("portfolio_*.xlsx")) dates = [dt.datetime.strptime(f.stem.split("_")[1], "%Y-%m-%d").date() for f in portfolio_files] return sorted(dates, reverse=True) @@ -31,7 +27,7 @@ def load_portfolio(trade_date): """ Load portfolio for a specific trade date. """ - portfolio_file = PORTFOLIOS_DIR / f"portfolio_{trade_date}.xlsx" + portfolio_file = PORTFOLIO_DIR / f"portfolio_{trade_date}.xlsx" if not portfolio_file.exists(): st.error(f"No portfolio found for trade date {trade_date}") return None diff --git a/stocksense/config/__init__.py b/stocksense/config/__init__.py index da3a8d6..34bf7b5 100644 --- a/stocksense/config/__init__.py +++ b/stocksense/config/__init__.py @@ -1,5 +1,39 @@ from .manager import ConfigManager +from .paths import ( + CACHE_DIR, + DATA_DIR, + DATABASE_DIR, + DATABASE_PATH, + FIXTURES_DIR, + INTERIM_DATA_DIR, + MODEL_DIR, + PACKAGE_DIR, + PORTFOLIO_DIR, + PROCESSED_DATA_DIR, + RAW_DATA_DIR, + REPORTS_DIR, + ROOT_DIR, + SCORES_DIR, + TEST_DIR, +) config = ConfigManager() -__all__ = ["ROOT_PATH", "config"] +__all__ = [ + "config", + "CACHE_DIR", + "DATA_DIR", + "DATABASE_DIR", + "DATABASE_PATH", + "FIXTURES_DIR", + "INTERIM_DATA_DIR", + "MODEL_DIR", + "PACKAGE_DIR", + "PORTFOLIO_DIR", + "PROCESSED_DATA_DIR", + "RAW_DATA_DIR", + "REPORTS_DIR", + "ROOT_DIR", + "SCORES_DIR", + "TEST_DIR", +] diff --git a/stocksense/config/paths.py b/stocksense/config/paths.py new file mode 100644 index 0000000..116c099 --- /dev/null +++ b/stocksense/config/paths.py @@ -0,0 +1,44 @@ +from pathlib import Path + +# Base paths +ROOT_DIR = Path(__file__).parents[2] +PACKAGE_DIR = ROOT_DIR / "stocksense" + +# Data paths +DATA_DIR = ROOT_DIR / "data" +RAW_DATA_DIR = DATA_DIR / "raw" +INTERIM_DATA_DIR = DATA_DIR / "interim" +PROCESSED_DATA_DIR = DATA_DIR / "processed" +CACHE_DIR = DATA_DIR / "cache" + +# Database paths +DATABASE_DIR = DATA_DIR / "database" +DATABASE_PATH = DATABASE_DIR / "stock_db.db" + +# Model paths +MODEL_DIR = ROOT_DIR / "models" + +# Report paths +REPORTS_DIR = ROOT_DIR / "reports" +SCORES_DIR = REPORTS_DIR / "scores" +PORTFOLIO_DIR = REPORTS_DIR / "portfolios" + +# Test paths +TEST_DIR = ROOT_DIR / "tests" +FIXTURES_DIR = TEST_DIR / "fixtures" + +# Ensure required directories exist +REQUIRED_DIRS = [ + RAW_DATA_DIR, + INTERIM_DATA_DIR, + PROCESSED_DATA_DIR, + CACHE_DIR, + DATABASE_DIR, + MODEL_DIR, + SCORES_DIR, + PORTFOLIO_DIR, + FIXTURES_DIR, +] + +for directory in REQUIRED_DIRS: + directory.mkdir(parents=True, exist_ok=True) diff --git a/stocksense/database/connection.py b/stocksense/database/connection.py index 3f718bd..8741da4 100644 --- a/stocksense/database/connection.py +++ b/stocksense/database/connection.py @@ -1,11 +1,9 @@ import sqlite3 -from pathlib import Path from sqlite3 import Error from loguru import logger -PACKAGE_DIR = Path(__file__).parents[1] -DATABASE_PATH = PACKAGE_DIR / "data/database/stock_db.db" +from stocksense.config import DATABASE_PATH class DatabaseConnection: diff --git a/stocksense/model/__init__.py b/stocksense/model/__init__.py index 7048164..972e375 100644 --- a/stocksense/model/__init__.py +++ b/stocksense/model/__init__.py @@ -2,4 +2,9 @@ from .portfolio import PortfolioBuilder from .xgboost_model import XGBoostClassifier, XGBoostRegressor -__all__ = ["XGBoostRegressor", "XGBoostClassifier", "ModelHandler", "PortfolioBuilder"] +__all__ = [ + "ModelHandler", + "PortfolioBuilder", + "XGBoostClassifier", + "XGBoostRegressor", +] diff --git a/stocksense/model/model_handler.py b/stocksense/model/model_handler.py index 8702e6b..8db9d0c 100644 --- a/stocksense/model/model_handler.py +++ b/stocksense/model/model_handler.py @@ -1,13 +1,12 @@ import datetime as dt import warnings -from pathlib import Path from typing import List, Optional import numpy as np import polars as pl from loguru import logger -from stocksense.config import config +from stocksense.config import MODEL_DIR, SCORES_DIR, config from .optuna_optimizer import OptunaOptimizer from .utils import ( @@ -18,9 +17,6 @@ ) from .xgboost_model import XGBoostClassifier -MODEL_DIR = Path(__file__).parents[1] / "model" / "model_base" -REPORT_DIR = Path(__file__).parents[2] / "reports" / "scores" - warnings.filterwarnings("ignore") @@ -186,7 +182,7 @@ def _save_scoring_report(self, rank_data: pl.DataFrame) -> None: DataFrame containing ranks for each target and average rank. """ try: - report_file = REPORT_DIR / f"scores_{self.trade_date.date()}.csv" + report_file = SCORES_DIR / f"scores_{self.trade_date.date()}.csv" rank_data.write_csv(report_file) logger.success(f"SAVED scoring report to {report_file}") except Exception as e: diff --git a/stocksense/model/portfolio.py b/stocksense/model/portfolio.py index ec3aa7f..2adaf93 100644 --- a/stocksense/model/portfolio.py +++ b/stocksense/model/portfolio.py @@ -1,11 +1,11 @@ import datetime as dt -from pathlib import Path import numpy as np import pandas as pd import polars as pl from loguru import logger +from stocksense.config import PORTFOLIO_DIR from stocksense.database import DatabaseHandler @@ -30,7 +30,6 @@ def __init__(self, weighting: str = "market_cap"): """ self.weighting = weighting self.db = DatabaseHandler() - self.portfolios_dir = Path(__file__).parents[2] / "reports" / "portfolios" def build_portfolio( self, n_stocks: int, trade_date: dt.datetime, data: pl.DataFrame @@ -193,7 +192,7 @@ def _save_portfolio_excel(self, portfolio: pl.DataFrame, trade_date: dt.datetime trade_date : dt.datetime Trade date. """ - excel_path = self.portfolios_dir / f"portfolio_{trade_date.date()}.xlsx" + excel_path = PORTFOLIO_DIR / f"portfolio_{trade_date.date()}.xlsx" # Convert to pandas once and rename columns portfolio_pd = portfolio.rename({ diff --git a/stocksense/pipeline/etl.py b/stocksense/pipeline/etl.py index 49fd41c..29bd59c 100644 --- a/stocksense/pipeline/etl.py +++ b/stocksense/pipeline/etl.py @@ -7,14 +7,11 @@ from loguru import logger from tqdm import tqdm -from stocksense.config import ConfigManager +from stocksense.config import DATA_DIR, ConfigManager from stocksense.database import DatabaseHandler from .scraper import Scraper -PACKAGE_DIR = Path(__file__).parents[1] -DATA_PATH = PACKAGE_DIR / "data" - class ETL: """ @@ -28,7 +25,7 @@ def __init__(self, config: ConfigManager, stocks: Optional[list[str]] = None): self.db_schema: dict = config.database.db_schema self.base_date: str = config.scraping.base_date self.fin_source: str = "yfinance" - self.historical_data_path: Path = DATA_PATH / "interim" + self.historical_data_path: Path = DATA_DIR / "interim" self._update_index_listings() self.stocks: list[str] = stocks or self._set_default_stocks() @@ -358,7 +355,7 @@ def restore_delisted_stocks_data(self) -> None: ) logger.info(f"Restoring market data for {delisted_stocks}") - prices_file = DATA_PATH / "raw" / "prices_2005-01-01_2018-12-31.csv" + prices_file = DATA_DIR / "raw" / "prices_2005-01-01_2018-12-31.csv" with open(prices_file) as f: data_types = f.readline().strip().split(',')[1:] @@ -411,11 +408,7 @@ def ingest_all_historical_data(self): """ Ingest historical stock data stored in .csv files. """ - - # read snapshot of S&P500 constituents and store in stocks info table self._ingest_stock_list() - - # iterate over stock historical and ingest it base_folder = self.historical_data_path / "company_data" for stock_folder in os.listdir(base_folder): stock_path = base_folder / stock_folder diff --git a/stocksense/pipeline/preprocess.py b/stocksense/pipeline/preprocess.py index 60b2812..d6b89d7 100644 --- a/stocksense/pipeline/preprocess.py +++ b/stocksense/pipeline/preprocess.py @@ -1,5 +1,4 @@ import datetime as dt -from pathlib import Path import numpy as np import polars as pl @@ -9,9 +8,6 @@ from stocksense.config import config from stocksense.database import DatabaseHandler -DATA_PATH = Path(__file__).parents[1] / "data" -FIXTURE_PATH = Path(__file__).parents[2] / "tests" / "fixtures" - def engineer_features() -> pl.DataFrame: """