From 97e2285b69ad9273817d09915f1c9e0ff8d3e5fe Mon Sep 17 00:00:00 2001 From: Francisco Silva Date: Fri, 15 Nov 2024 15:40:57 +0000 Subject: [PATCH] Refactor config file structure and processing pipeline. Lint changes --- README.md | 1 - notebooks/modeling.ipynb | 43 +- pyproject.toml | 5 +- stocksense/app/pages/analytics.py | 35 +- stocksense/app/pages/overview.py | 11 +- stocksense/config/__init__.py | 6 +- .../config/{ => defaults}/db_config.yml | 2 +- .../config/{ => defaults}/model_config.yml | 8 +- .../config/defaults/processing_config.yml | 21 + .../config/{ => defaults}/scraping_config.yml | 0 stocksense/config/definitions.py | 16 - stocksense/config/manager.py | 182 +++++ stocksense/config/path_config.yml | 1 - stocksense/config/processing_config.yml | 9 - stocksense/database_handler/__init__.py | 8 +- stocksense/database_handler/handler.py | 11 +- stocksense/database_handler/queries.py | 8 +- stocksense/main.py | 4 +- stocksense/model/__init__.py | 3 +- stocksense/model/genetic_algorithm.py | 32 +- stocksense/model/model_handler.py | 42 +- stocksense/model/xgboost_model.py | 12 +- stocksense/pipeline/__init__.py | 8 +- stocksense/pipeline/etl.py | 14 +- stocksense/pipeline/preprocess.py | 763 +++++++++--------- stocksense/pipeline/scraper.py | 52 +- tests/test_demo.py | 4 +- 27 files changed, 694 insertions(+), 607 deletions(-) rename stocksense/config/{ => defaults}/db_config.yml (99%) rename stocksense/config/{ => defaults}/model_config.yml (88%) create mode 100644 stocksense/config/defaults/processing_config.yml rename stocksense/config/{ => defaults}/scraping_config.yml (100%) delete mode 100644 stocksense/config/definitions.py create mode 100644 stocksense/config/manager.py delete mode 100644 stocksense/config/path_config.yml delete mode 100644 stocksense/config/processing_config.yml diff --git a/README.md b/README.md index 088e381..640f9c8 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,6 @@ The stock classifier is trained using financial ratios and growth features deriv - **Model Training**: A classifier using GA-XGBoost with features including growth ratios, financial metrics, price momentum, and volatility. - **Streamlit App**: A web-based interface for exploring stock metrics, visualizing growth ratios, and viewing model predictions. - **SQLite Database**: Locally stored market, financials, insider trading and status data for historical and current S&P500 members. -- **Pyproject-based Installation**: Easy setup using `pyproject.toml` for dependency management. 
## Installation diff --git a/notebooks/modeling.ipynb b/notebooks/modeling.ipynb index 6be89c2..1173a83 100644 --- a/notebooks/modeling.ipynb +++ b/notebooks/modeling.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -10,18 +10,11 @@ "\n", "import plotly.express as px\n", "import polars as pl\n", - "from config import get_config\n", - "from model import XGBoostModel" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "target_col = \"fperf\"\n", - "date_col = \"tdq\"" + "from config import config\n", + "from model import XGBoostModel\n", + "\n", + "target_col = config.model.target\n", + "date_col = config.model.date_col" ] }, { @@ -96,9 +89,7 @@ } ], "source": [ - "data = pl.read_csv(\n", - " \"../data/1_work_data/processed/proc_2024-11-02.csv\", try_parse_dates=True\n", - ")\n", + "data = pl.read_csv(\"../data/1_work_data/processed/proc_2024-11-02.csv\", try_parse_dates=True)\n", "data.head()" ] }, @@ -108,8 +99,8 @@ "metadata": {}, "outputs": [], "source": [ - "features = get_config(\"model\")[\"features\"]\n", - "df = data.select([pl.col(\"fperf\")] + [pl.col(ratio) for ratio in features]).to_pandas()" + "features = config.model.features\n", + "df = data.select([pl.col(target_col)] + [pl.col(ratio) for ratio in features]).to_pandas()" ] }, { @@ -1215,14 +1206,11 @@ ], "source": [ "data = data.filter(\n", - " (pl.col(\"tdq\") < last_trade_date)\n", - " & (~pl.all_horizontal(pl.col(target_col).is_null()))\n", + " (pl.col(\"tdq\") < last_trade_date) & (~pl.all_horizontal(pl.col(target_col).is_null()))\n", ")\n", "\n", "# filter cols\n", - "aux_cols = [\"datadate\", \"rdq\", \"sector\"] + [\n", - " t for t in get_config(\"model\")[\"targets\"] if t != target_col\n", - "]\n", + "aux_cols = [\"datadate\", \"rdq\", \"sector\"] + [t for t in config.model.targets if t != target_col]\n", "data = data.select([c for c in data.columns if c not in aux_cols])\n", "data.head()" ] @@ -1647,8 +1635,7 @@ "\n", "def get_scale():\n", " scale = round(\n", - " len(train.filter(pl.col(target_col) == 0))\n", - " / len(train.filter(pl.col(target_col) == 1))\n", + " len(train.filter(pl.col(target_col) == 0)) / len(train.filter(pl.col(target_col) == 1))\n", " )\n", "\n", " print(f\"Scale of training data (pos/neg): {scale}\")\n", @@ -1725,7 +1712,7 @@ " \"scale_pos_weight\": scale,\n", " \"eval_metric\": \"logloss\",\n", " \"nthread\": -1,\n", - " \"seed\": get_config(\"model\")[\"seed\"],\n", + " \"seed\": config.model.seed,\n", "}\n", "\n", "rbf_model = XGBClassifier(**params)\n", @@ -2106,9 +2093,7 @@ "\n", "feature_names = [\n", " a + \": \" + str(b)\n", - " for a, b in zip(\n", - " X_train.columns, np.abs(shap_values.values).mean(0).round(2), strict=False\n", - " )\n", + " for a, b in zip(X_train.columns, np.abs(shap_values.values).mean(0).round(2), strict=False)\n", "]\n", "\n", "shap.summary_plot(\n", diff --git a/pyproject.toml b/pyproject.toml index 01db63e..256f24e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,8 @@ dependencies = [ "watchdog", "ipykernel", "shap", - "pre-commit" + "pre-commit", + "pydantic" ] [project.optional-dependencies] @@ -52,7 +53,7 @@ fail_under = 80 where = ["stocksense"] [tool.ruff] -line-length = 88 +line-length = 100 target-version = "py310" [tool.ruff.lint] diff --git a/stocksense/app/pages/analytics.py b/stocksense/app/pages/analytics.py index a9e86ce..fcf8cde 100644 --- 
a/stocksense/app/pages/analytics.py +++ b/stocksense/app/pages/analytics.py @@ -48,8 +48,7 @@ def load_processed_data(): csv_files = directory_path.glob("*.csv") date_files = [ - (file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")) - for file in csv_files + (file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")) for file in csv_files ] if date_files: most_recent_file = max(date_files, key=lambda x: x[1])[0] @@ -76,14 +75,11 @@ def display_stock_info(stock, info): st.markdown(f"**Sector**: {stock.loc[0, 'sector']}") st.markdown(f"**Last price**: {(info.loc[0, 'curr_price']):.2f} $") st.markdown(f"**Market Cap**: {(info.loc[0, 'market_cap'] / MILLION):.2f} M$") - st.markdown( - f"**Out. Shares**: {(info.loc[0, 'shares_outstanding'] / MILLION):.2f} M" - ) + st.markdown(f"**Out. Shares**: {(info.loc[0, 'shares_outstanding'] / MILLION):.2f} M") st.markdown(f"**Volume**: {(info.loc[0, 'volume'])} M$") st.markdown(f"**Beta**: {(info.loc[0, 'beta']):.3f}") st.markdown( - "**Enterprise Value**: " - f"{(info.loc[0, 'enterprise_value'] / MILLION):.2f} M$" + "**Enterprise Value**: " f"{(info.loc[0, 'enterprise_value'] / MILLION):.2f} M$" ) st.divider() st.markdown(f"**Trailing PE**: {(info.loc[0, 'fiftytwo_wc']):.2f}") @@ -145,10 +141,7 @@ def plot_market_data(df, index_df): col=1, ) - colors = [ - "#27AE60" if dif >= 0 else "#B03A2E" - for dif in df["close"].diff().values.tolist() - ] + colors = ["#27AE60" if dif >= 0 else "#B03A2E" for dif in df["close"].diff().values.tolist()] fig.add_trace( go.Bar(x=df["date"], y=df["volume"], showlegend=False, marker_color=colors), @@ -172,9 +165,7 @@ def plot_financial_data(df): """ col = st.selectbox("Select", df.columns[3:], key="financial") fig = go.Figure() - fig.add_trace( - go.Bar(x=df["rdq"], y=df[col], name=f"{col}", marker_color="orangered") - ) + fig.add_trace(go.Bar(x=df["rdq"], y=df[col], name=f"{col}", marker_color="orangered")) fig.update_layout(template="plotly_dark") st.plotly_chart(fig, use_container_width=True, theme=None) @@ -224,9 +215,7 @@ def plot_insider_data(df): Plots scatter plot for insider trading data. 
""" - df["value"] = ( - df["value"].replace({r"\$": "", ",": ""}, regex=True).astype(float).abs() - ) + df["value"] = df["value"].replace({r"\$": "", ",": ""}, regex=True).astype(float).abs() fig = px.scatter( df, @@ -249,9 +238,7 @@ def plot_processed_data(df): """ col = st.selectbox("Select", df.columns[15:], key="proc") fig = go.Figure() - fig.add_trace( - go.Bar(x=df["tdq"], y=df[col], name=f"{col}", marker_color="orangered") - ) + fig.add_trace(go.Bar(x=df["tdq"], y=df[col], name=f"{col}", marker_color="orangered")) st.plotly_chart(fig, use_container_width=True) @@ -311,16 +298,12 @@ def main(): with tab1: display_stock_info(stock, info) with tab2: - mdf = market[ - (market["date"] >= start_dates[selected_range]) - & (market["date"] <= max_date) - ] + mdf = market[(market["date"] >= start_dates[selected_range]) & (market["date"] <= max_date)] idf = sp[(sp["date"] >= start_dates[selected_range]) & (sp["date"] <= max_date)] plot_market_data(mdf, idf) with tab3: fdf = financials.loc[ - (financials["rdq"] >= start_dates[selected_range]) - & (financials["rdq"] <= max_date) + (financials["rdq"] >= start_dates[selected_range]) & (financials["rdq"] <= max_date) ] plot_financial_data(fdf) with tab4: diff --git a/stocksense/app/pages/overview.py b/stocksense/app/pages/overview.py index 74260a2..269b46a 100644 --- a/stocksense/app/pages/overview.py +++ b/stocksense/app/pages/overview.py @@ -26,12 +26,7 @@ def load_sp500_data(): financials = db.fetch_financial_data().to_pandas() financials["rdq"] = pd.to_datetime(financials["rdq"]) - financials = ( - financials.sort_values("rdq", ascending=False) - .groupby("tic") - .first() - .reset_index() - ) + financials = financials.sort_values("rdq", ascending=False).groupby("tic").first().reset_index() stock_df = stock_df.merge(financials, how="left", on="tic") return stock_df @@ -72,9 +67,7 @@ def show_recent_earnings(data): "tic": "Stock", "rdq": st.column_config.DateColumn("Earnings Date", format="YYYY-MM-DD"), "sector": "Sector", - "curr_price": st.column_config.NumberColumn( - "Current Price", format="$%.2f" - ), + "curr_price": st.column_config.NumberColumn("Current Price", format="$%.2f"), "saleq": st.column_config.NumberColumn("Sales", format="$%.2f"), "surprise_pct": st.column_config.NumberColumn("Surprise %", format="$%.2f"), }, diff --git a/stocksense/config/__init__.py b/stocksense/config/__init__.py index 3da7268..212b9cc 100644 --- a/stocksense/config/__init__.py +++ b/stocksense/config/__init__.py @@ -1,3 +1,5 @@ -from config.definitions import ROOT_PATH, get_config +from config.manager import ConfigManager -__all__ = ["ROOT_PATH", "get_config"] +config = ConfigManager() + +__all__ = ["ROOT_PATH", "config"] diff --git a/stocksense/config/db_config.yml b/stocksense/config/defaults/db_config.yml similarity index 99% rename from stocksense/config/db_config.yml rename to stocksense/config/defaults/db_config.yml index 4f0c372..e8c1613 100644 --- a/stocksense/config/db_config.yml +++ b/stocksense/config/defaults/db_config.yml @@ -1,4 +1,4 @@ -'schema': +'db_schema': 'stock': - 'tic' - 'name' diff --git a/stocksense/config/model_config.yml b/stocksense/config/defaults/model_config.yml similarity index 88% rename from stocksense/config/model_config.yml rename to stocksense/config/defaults/model_config.yml index a19ba84..39222ff 100644 --- a/stocksense/config/model_config.yml +++ b/stocksense/config/defaults/model_config.yml @@ -3,7 +3,9 @@ - 'n_purch' - 'n_sales' - 'insider_balance' - # growth, momentum and volatility features + # market momentum 
and volatility features + - 'volume_ma20' + - 'volume_ma50' - 'price_mom' - 'price_qoq' - 'price_yoy' @@ -23,6 +25,9 @@ - 'momentum_qoq' - 'momentum_yoy' - 'momentum_2y' + - 'fear_ma30' + - 'high_fear' + - 'low_fear' # financial features - 'gpm' - 'roa' @@ -46,6 +51,7 @@ - 'pb' - 'ps' - 'ev_ebitda' + - 'f_score' # growth features - 'saleq_yoy' - 'saleq_2y' diff --git a/stocksense/config/defaults/processing_config.yml b/stocksense/config/defaults/processing_config.yml new file mode 100644 index 0000000..e129d09 --- /dev/null +++ b/stocksense/config/defaults/processing_config.yml @@ -0,0 +1,21 @@ +'two_week_trading_days': 10 +'month_trading_days': 21 +'quarter_trading_days': 61 +'semester_trading_days': 126 +'year_trading_days': 252 +'two_year_trading_days': 504 +'prediction_horizon': 4 +'over_performance_threshold': 0.1 +'performance_threshold': 0.4 +'sectors': + - 'Health Care' + - 'Financials' + - 'Industrials' + - 'Consumer Discretionary' + - 'Information Technology' + - 'Communication Services' + - 'Consumer Staples' + - 'Utilities' + - 'Real Estate' + - 'Materials' + - 'Energy' diff --git a/stocksense/config/scraping_config.yml b/stocksense/config/defaults/scraping_config.yml similarity index 100% rename from stocksense/config/scraping_config.yml rename to stocksense/config/defaults/scraping_config.yml diff --git a/stocksense/config/definitions.py b/stocksense/config/definitions.py deleted file mode 100644 index fa4ee6a..0000000 --- a/stocksense/config/definitions.py +++ /dev/null @@ -1,16 +0,0 @@ -import os.path - -import yaml - -ROOT_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) - -""" -Get project root path and publish as a global variable. -""" - - -def get_config(config_file: str) -> dict: - with open( - os.path.join(ROOT_PATH, f"config/{config_file}_config.yml"), encoding="utf8" - ) as file: - return yaml.safe_load(file) diff --git a/stocksense/config/manager.py b/stocksense/config/manager.py new file mode 100644 index 0000000..9ec95cc --- /dev/null +++ b/stocksense/config/manager.py @@ -0,0 +1,182 @@ +from datetime import datetime +from pathlib import Path +from typing import Dict, List + +import yaml +from pydantic import ( + BaseModel, + Field, + field_validator, + model_validator, +) + + +class ScrapingConfig(BaseModel): + base_date: str = Field(description="Starting date for data collection in YYYY-MM-DD format") + crsp_columns: List[str] = Field(min_length=1) + macrotrends: Dict[str, Dict[str, str]] + yahoo: Dict[str, str] + yahoo_info: Dict[str, str] + + @property + def start_date(self) -> datetime: + """Convert base_date string to datetime object.""" + return datetime.strptime(self.base_date, "%Y-%m-%d") + + +class ProcessingConfig(BaseModel): + trading_days_2week: int = Field(alias="two_week_trading_days") + trading_days_month: int = Field(alias="month_trading_days") + trading_days_quarter: int = Field(alias="quarter_trading_days") + trading_days_semester: int = Field(alias="semester_trading_days") + trading_days_year: int = Field(alias="year_trading_days") + trading_days_2year: int = Field(alias="two_year_trading_days") + prediction_horizon: int = Field(gt=0) + over_performance_threshold: float = Field( + gt=0.0, lt=1.0, description="Overperformance threshold" + ) + performance_threshold: float = Field(gt=0.0, lt=1.0, description="Performance threshold") + sectors: List[str] = Field(min_length=11, max_length=11) + + @model_validator(mode="after") + def validate_trading_days_order(self) -> "ProcessingConfig": + """Validate that trading days are 
in ascending order.""" + if not ( + self.trading_days_2week + < self.trading_days_month + < self.trading_days_quarter + < self.trading_days_semester + < self.trading_days_year + < self.trading_days_2year + ): + raise ValueError("Trading days must be in ascending order") + return self + + @property + def trading_days(self) -> Dict[str, int]: + """Get all trading day periods in a dictionary.""" + return { + "2week": self.trading_days_2week, + "month": self.trading_days_month, + "quarter": self.trading_days_quarter, + "semester": self.trading_days_semester, + "year": self.trading_days_year, + "2year": self.trading_days_2year, + } + + @property + def performance_thresholds(self) -> Dict[str, float]: + """Get all performance thresholds in a dictionary.""" + return { + "over_performance": self.over_performance_threshold, + "performance": self.performance_threshold, + } + + +class ModelConfig(BaseModel): + features: List[str] + target: str + id_col: str + date_col: str + train_start: int = Field(ge=2005, le=datetime.now().year) + train_window: int = Field(gt=0) + val_window: int = Field(gt=0) + seed: int = Field(ge=0) + + @model_validator(mode="after") + def validate_column_names(self) -> "ModelConfig": + """Validate that target, id_col and date_col are not in features.""" + special_cols = [self.target, self.id_col, self.date_col] + if any(col in self.features for col in special_cols): + raise ValueError("features list cannot contain target, id_col or date_col") + return self + + @model_validator(mode="after") + def validate_window_sizes(self) -> "ModelConfig": + """Validate that train_start + train_window + val_window is less than the current year.""" + if self.train_start + self.train_window + self.val_window > datetime.now().year - 1: + raise ValueError("Window size overflow") + return self + + @property + def feature_count(self) -> int: + """Get the number of features.""" + return len(self.features) + + @property + def window_sizes(self) -> Dict[str, int]: + """Get all window sizes in a dictionary.""" + return { + "train": self.train_window, + "validation": self.val_window, + } + + +class DatabaseConfig(BaseModel): + db_schema: Dict[str, List[str]] + + @field_validator("db_schema") + @classmethod + def validate_schema_structure(cls, v: Dict[str, List[str]]) -> Dict[str, List[str]]: + """Validate that each table has at least one column.""" + for table, columns in v.items(): + if not columns: + raise ValueError(f"Table '{table}' must have at least one column") + return v + + @property + def tables(self) -> List[str]: + """Get list of all tables in the database.""" + return list(self.db_schema.keys()) + + def get_columns(self, table: str) -> List[str]: + """Get columns for a specific table.""" + if table not in self.db_schema: + raise ValueError(f"Table '{table}' not found in schema") + return self.db_schema[table] + + +class ConfigManager: + def __init__(self): + self.config_path = Path(__file__).parent / "defaults" + self.scraping: ScrapingConfig = None + self.processing: ProcessingConfig = None + self.model: ModelConfig = None + self.database: DatabaseConfig = None + self._load_configs() + + def _load_configs(self): + self.scraping = ScrapingConfig(**self._get_config("scraping")) + self.processing = ProcessingConfig(**self._get_config("processing")) + self.model = ModelConfig(**self._get_config("model")) + self.database = DatabaseConfig(**self._get_config("db")) + + def _get_config(self, config_file: str) -> dict: + config_path = self.config_path / f"{config_file}_config.yml" + if not 
config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(config_path, encoding="utf8") as file: + try: + return yaml.safe_load(file) + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML in {config_file}_config.yml: {e}") from e + + @property + def trading_days(self) -> Dict[str, int]: + """Shortcut to access trading days configuration.""" + return self.processing.trading_days + + @property + def features(self) -> List[str]: + """Shortcut to access model features.""" + return self.model.features + + @property + def tables(self) -> List[str]: + """Shortcut to access database tables.""" + return self.database.tables + + def get_table_columns(self, table: str) -> List[str]: + """Shortcut to get columns for a specific table.""" + return self.database.get_columns(table) diff --git a/stocksense/config/path_config.yml b/stocksense/config/path_config.yml deleted file mode 100644 index 464f69d..0000000 --- a/stocksense/config/path_config.yml +++ /dev/null @@ -1 +0,0 @@ -"logs" diff --git a/stocksense/config/processing_config.yml b/stocksense/config/processing_config.yml deleted file mode 100644 index f78ce8d..0000000 --- a/stocksense/config/processing_config.yml +++ /dev/null @@ -1,9 +0,0 @@ -'2week_trading_days': 10 -'month_trading_days': 21 -'quarter_trading_days': 61 -'semester_trading_days': 126 -'year_trading_days': 252 -'2year_trading_days': 504 -'prediction_horizon': 4 -'over_performance_threshold': 0.1 -'performance_threshold': 0.4 diff --git a/stocksense/database_handler/__init__.py b/stocksense/database_handler/__init__.py index b869b16..91f1d17 100644 --- a/stocksense/database_handler/__init__.py +++ b/stocksense/database_handler/__init__.py @@ -1,6 +1,6 @@ -from database_handler.connection import DatabaseConnection -from database_handler.handler import DatabaseHandler -from database_handler.queries import ( +from .connection import DatabaseConnection +from .handler import DatabaseHandler +from .queries import ( count_data, delete_data, delete_table, @@ -10,7 +10,7 @@ insert_record, update_data, ) -from database_handler.schema import create_tables +from .schema import create_tables __all__ = [ "DatabaseConnection", diff --git a/stocksense/database_handler/handler.py b/stocksense/database_handler/handler.py index 0b07d8f..2dbd2e1 100644 --- a/stocksense/database_handler/handler.py +++ b/stocksense/database_handler/handler.py @@ -3,10 +3,10 @@ import numpy as np import polars as pl -from database_handler import ( - DatabaseConnection, + +from .connection import DatabaseConnection +from .queries import ( count_data, - create_tables, delete_data, delete_table, fetch_data, @@ -14,6 +14,7 @@ insert_record, update_data, ) +from .schema import create_tables sqlite3.register_adapter(np.int32, int) sqlite3.register_adapter(np.int64, int) @@ -147,6 +148,4 @@ def convert_date_columns_to_str(df, cols, date_format="%Y-%m-%d") -> pl.DataFram def convert_str_columns_to_date(df, cols, date_format="%Y-%m-%d") -> pl.DataFrame: - return df.with_columns( - [pl.col(col).str.to_date(format=date_format) for col in cols] - ) + return df.with_columns([pl.col(col).str.to_date(format=date_format) for col in cols]) diff --git a/stocksense/database_handler/queries.py b/stocksense/database_handler/queries.py index af55cf0..48ceb0d 100644 --- a/stocksense/database_handler/queries.py +++ b/stocksense/database_handler/queries.py @@ -112,9 +112,7 @@ def fetch_record( row = cursor.fetchone() if row: columns = [description[0] for description in cursor.description] - 
return pl.DataFrame( - [row], schema=columns, orient="row", infer_schema_length=None - ) + return pl.DataFrame([row], schema=columns, orient="row", infer_schema_length=None) else: return None except Error as e: @@ -137,9 +135,7 @@ def fetch_data( columns = [description[0] for description in cursor.description] data = cursor.fetchall() if data: - return pl.DataFrame( - data, schema=columns, orient="row", infer_schema_length=None - ) + return pl.DataFrame(data, schema=columns, orient="row", infer_schema_length=None) else: return None except Error as e: diff --git a/stocksense/main.py b/stocksense/main.py index 8740bd8..17a45da 100644 --- a/stocksense/main.py +++ b/stocksense/main.py @@ -1,6 +1,6 @@ import click from model import ModelHandler -from pipeline import ETL, Preprocess +from pipeline import ETL, process_stock_data @click.command() @@ -17,7 +17,7 @@ def main(update, train, score): etl_handler.update_index_listings() etl_handler.extract() if train: - data = Preprocess().run() + data = process_stock_data() handler = ModelHandler() handler.train(data) if score: diff --git a/stocksense/model/__init__.py b/stocksense/model/__init__.py index 0e32699..fd411d5 100644 --- a/stocksense/model/__init__.py +++ b/stocksense/model/__init__.py @@ -1,5 +1,4 @@ -from model.genetic_algorithm import GeneticAlgorithm from model.model_handler import ModelHandler from model.xgboost_model import XGBoostModel -__all__ = ["XGBoostModel", "GeneticAlgorithm", "ModelHandler"] +__all__ = ["XGBoostModel", "ModelHandler"] diff --git a/stocksense/model/genetic_algorithm.py b/stocksense/model/genetic_algorithm.py index 141ce49..d18a029 100644 --- a/stocksense/model/genetic_algorithm.py +++ b/stocksense/model/genetic_algorithm.py @@ -2,9 +2,10 @@ import polars as pl import pygad -from config import get_config +from config import config from loguru import logger -from model import XGBoostModel + +from .xgboost_model import XGBoostModel class GeneticAlgorithm: @@ -31,7 +32,7 @@ def __init__( self.best_fitness_value = 0 self.no_improv_count = 0 self.no_improv_limit = 5 - self.random_seed = get_config("model")["seed"] + self.random_seed = config.model.seed def create_instance(self): logger.info("creating GA instance") @@ -59,9 +60,7 @@ def on_generation(self, ga_instance): Callback function. """ - best_solution, best_solution_fitness, best_solution_idx = ( - ga_instance.best_solution() - ) + best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution() logger.info(f"generation {ga_instance.generations_completed}:") logger.info(f"\tbest solution: {best_solution}") logger.info(f"\tbest fitness: {best_solution_fitness}") @@ -73,16 +72,12 @@ def on_generation(self, ga_instance): self.no_improv_count += 1 if self.no_improv_count >= self.no_improv_limit: - print( - f"no improvement for {self.no_improv_limit} generations, stopping GA." - ) + print(f"no improvement for {self.no_improv_limit} generations, stopping GA.") ga_instance.terminate() def train(self): if self.ga_instance is None: - raise Exception( - "GA instance is not created. Call create_instance() before training." - ) + raise Exception("GA instance is not created. Call create_instance() before training.") self.ga_instance.run() def best_solution(self): @@ -96,15 +91,12 @@ def best_solution(self): def plot_fitness(self): if self.ga_instance is None: raise Exception( - "GA instance is not created. " - "Call create_instance() before plotting fitness." + "GA instance is not created. " "Call create_instance() before plotting fitness." 
) self.ga_instance.plot_fitness() -def get_train_val_split( - data: pl.DataFrame, start_year: int, train_window: int, val_window: int -): +def get_train_val_split(data: pl.DataFrame, start_year: int, train_window: int, val_window: int): """ Split the dataset into training and validation sets, based on walk-forward validation strategy. @@ -155,7 +147,7 @@ def fitness_function(ga_instance, solution, solution_idx): "scale_pos_weight": scale, "eval_metric": "logloss", "nthread": -1, - "seed": get_config("model")["seed"], + "seed": config.model.seed, } model = XGBoostModel(params) @@ -163,9 +155,7 @@ def fitness_function(ga_instance, solution, solution_idx): window = train_window while start_year + window + val_window < dt.datetime.now().year - 1: train, val = get_train_val_split(data, start_year, window, val_window) - X_train = train.select( - pl.exclude([tic_col, target_col, date_col]) - ).to_pandas() + X_train = train.select(pl.exclude([tic_col, target_col, date_col])).to_pandas() y_train = train.select(target_col).to_pandas().values.ravel() X_val = val.select(pl.exclude([tic_col, target_col, date_col])).to_pandas() y_val = val.select(target_col).to_pandas().values.ravel() diff --git a/stocksense/model/model_handler.py b/stocksense/model/model_handler.py index 312ffc6..f10ad98 100644 --- a/stocksense/model/model_handler.py +++ b/stocksense/model/model_handler.py @@ -3,7 +3,7 @@ from pathlib import Path import polars as pl -from config import get_config +from config import config from loguru import logger from .genetic_algorithm import GeneticAlgorithm, fitness_function_wrapper @@ -22,14 +22,13 @@ class ModelHandler: """ def __init__(self): - model_settings = get_config("model") - self.id_col = model_settings["id_col"] - self.date_col = model_settings["date_col"] - self.target_col = model_settings["target"] - self.train_start = model_settings["train_start"] - self.train_window = model_settings["train_window"] - self.val_window = model_settings["val_window"] - self.seed = model_settings["seed"] + self.id_col = config.model.id_col + self.date_col = config.model.date_col + self.target_col = config.model.target + self.train_start = config.model.train_start + self.train_window = config.model.train_window + self.val_window = config.model.val_window + self.seed = config.model.seed def train(self, data: pl.DataFrame): """ @@ -41,24 +40,15 @@ def train(self, data: pl.DataFrame): Preprocessed financial data. 
""" try: - if ( - self.train_start + self.train_window + self.val_window - > dt.datetime.now().year - 1 - ): - raise Exception("Window size overflow") - trade_date = find_last_trading_date() logger.info(f"START training model - {trade_date}") train_df = data.filter( - (pl.col("tdq") < trade_date) - & ~pl.all_horizontal(pl.col(self.target_col).is_null()) + (pl.col("tdq") < trade_date) & ~pl.all_horizontal(pl.col(self.target_col).is_null()) ) scale = self.get_dataset_imbalance_scale(train_df) aux_cols = ["datadate", "rdq", "sector"] - train_df = train_df.select( - [c for c in train_df.columns if c not in aux_cols] - ) + train_df = train_df.select([c for c in train_df.columns if c not in aux_cols]) ga = GeneticAlgorithm( num_generations=50, @@ -163,19 +153,13 @@ def score(self, data): logger.info(f"START stocksense eval - {trade_date}") test_df = data.filter((pl.col("tdq") == trade_date)) - test_df = test_df.filter( - ~pl.all_horizontal(pl.col(self.target_col).is_null()) - ) + test_df = test_df.filter(~pl.all_horizontal(pl.col(self.target_col).is_null())) aux_cols = ["datadate", "rdq", "tic", "sector", "freturn", "adj_freturn"] test_df = test_df.select([c for c in test_df.columns if c not in aux_cols]) - test_df = test_df.select( - pl.exclude([self.target_col, self.date_col]) - ).to_pandas() + test_df = test_df.select(pl.exclude([self.target_col, self.date_col])).to_pandas() - model_path = model_path = ( - Path("models/") / f"xgb_{self.last_trade_date}.pkl" - ) + model_path = model_path = Path("models/") / f"xgb_{self.last_trade_date}.pkl" model = XGBoostModel().load_model(model_path) model.predict_proba(test_df) except Exception: diff --git a/stocksense/model/xgboost_model.py b/stocksense/model/xgboost_model.py index 4292fc4..512d411 100644 --- a/stocksense/model/xgboost_model.py +++ b/stocksense/model/xgboost_model.py @@ -2,7 +2,7 @@ import sklearn.metrics as skm import xgboost as xgb -from config import get_config +from config import config class XGBoostModel: @@ -24,7 +24,7 @@ def __init__(self, params=None, scale=1.0): "scale_pos_weight": scale, "eval_metric": "logloss", "nthread": -1, - "seed": get_config("model")["seed"], + "seed": config.model.seed, } ) self.model = None @@ -35,16 +35,12 @@ def train(self, X_train, y_train): def predict(self, X): if self.model is None: - raise Exception( - "Model is not trained yet. Train the model before predicting." - ) + raise Exception("Model is not trained yet. Train the model before predicting.") return self.model.predict(X) def predict_proba(self, X): if self.model is None: - raise Exception( - "Model is not trained yet. Train the model before predicting." - ) + raise Exception("Model is not trained yet. 
Train the model before predicting.") return self.model.predict_proba(X)[:, 1] def evaluate(self, X_test, y_test): diff --git a/stocksense/pipeline/__init__.py b/stocksense/pipeline/__init__.py index b8aa5e7..3fcd0a1 100644 --- a/stocksense/pipeline/__init__.py +++ b/stocksense/pipeline/__init__.py @@ -1,5 +1,5 @@ -from pipeline.etl import ETL -from pipeline.preprocess import Preprocess -from pipeline.scraper import Scraper +from .etl import ETL +from .preprocess import process_stock_data +from .scraper import Scraper -__all__ = ["Scraper", "ETL", "Preprocess"] +__all__ = ["Scraper", "ETL", "process_stock_data"] diff --git a/stocksense/pipeline/etl.py b/stocksense/pipeline/etl.py index 2d1cc7b..d8bf676 100644 --- a/stocksense/pipeline/etl.py +++ b/stocksense/pipeline/etl.py @@ -4,7 +4,7 @@ from typing import Optional import polars as pl -from config import get_config +from config import config from database_handler import DatabaseHandler from loguru import logger from tqdm import tqdm @@ -24,8 +24,8 @@ class ETL: def __init__(self, stocks: Optional[list[str]] = None): self.db = DatabaseHandler() - self.db_schema = get_config("db")["schema"] - self.base_date = get_config("scraping")["base_date"] + self.db_schema = config.database.db_schema + self.base_date = config.scraping.base_date self.fin_source = "yfinance" self.historical_data_path = DATA_PATH / "interim" self.stocks = stocks or self._set_default_stocks() @@ -72,9 +72,7 @@ def update_index_listings(self) -> None: self._update_delisted_symbols(stock_df, active_df) self._add_new_symbols(stock_df, active_df) - def _update_delisted_symbols( - self, stock_df: pl.DataFrame, active_df: pl.DataFrame - ) -> None: + def _update_delisted_symbols(self, stock_df: pl.DataFrame, active_df: pl.DataFrame) -> None: """ Downgrade delisted symbols from S&P500. @@ -242,9 +240,7 @@ def extract_info(self, tic: str, scraper: Scraper) -> bool: logger.error(f"{tic}: info extraction FAILED") return False - def extract_fundamental_data( - self, tic: str, scraper: Scraper, last_update: dt.date - ) -> bool: + def extract_fundamental_data(self, tic: str, scraper: Scraper, last_update: dt.date) -> bool: """ Extract financial statement data and update database financials table. diff --git a/stocksense/pipeline/preprocess.py b/stocksense/pipeline/preprocess.py index d85a9ba..9d20deb 100644 --- a/stocksense/pipeline/preprocess.py +++ b/stocksense/pipeline/preprocess.py @@ -4,211 +4,69 @@ import numpy as np import polars as pl import polars_talib as plta -from config import get_config +from config import config from database_handler import DatabaseHandler from loguru import logger -CONFIG = get_config("processing") PACKAGE_DIR = Path(__file__).parents[1] DATA_PATH = PACKAGE_DIR / "data" -# TODO: compute volatility and risk features, compute Piotroski F-Score +# TODO: compute Piotroski F-Score -class Preprocess: +def process_stock_data() -> pl.DataFrame: """ - Stock data processing pipeline handler. + Runs main data processing pipeline. """ + logger.info("START processing stock data") + db = DatabaseHandler() - def __init__(self): - self.db = DatabaseHandler() - self.index_data = self._process_index_data() + try: + # fetch all required data + df = db.fetch_financial_data() + info = db.fetch_stock() + market_df = db.fetch_market_data().sort(["tic", "date"]) + insider_df = db.fetch_insider_data() + index_data = db.fetch_index_data() + vix_data = db.fetch_vix_data() - def run(self): - """ - Runs main data processing pipeline. 
- """ - logger.info("START processing stock data") - data = self.feature_engineering() - data = self.clean_data(data) - logger.success("END processing stock data") - return data - - def _process_index_data(self) -> pl.DataFrame: - """ - Process S&P500 index price data. - - Returns - ------- - pl.DataFrame - Processed data, incl. forward and past return rates. - """ - logger.info("START processing S&P500 index data") - - index_df = self.db.fetch_index_data() - if index_df.is_empty(): - raise ValueError("Empty index data received") - - index_df = index_df.sort(by=["date"]) - - # compute index past returns - index_df = index_df.with_columns( - pl.col("close").pct_change(CONFIG["month_trading_days"]).alias("index_mom"), - pl.col("close") - .pct_change(CONFIG["quarter_trading_days"]) - .alias("index_qoq"), - pl.col("close") - .pct_change(CONFIG["semester_trading_days"]) - .alias("index_sos"), - pl.col("close").pct_change(CONFIG["year_trading_days"]).alias("index_yoy"), - pl.col("close").pct_change(CONFIG["2year_trading_days"]).alias("index_2y"), - ) - - # compute volatily of index - index_df = index_df.with_columns( - (pl.col("close") / pl.col("close").shift(1)).log().alias("log_return") - ).with_columns( - pl.col("log_return") - .rolling_std(CONFIG["month_trading_days"]) - .alias("index_vol_mom"), - pl.col("log_return") - .rolling_std(CONFIG["quarter_trading_days"]) - .alias("index_vol_qoq"), - pl.col("log_return") - .rolling_std(CONFIG["semester_trading_days"]) - .alias("index_vol_sos"), - pl.col("log_return") - .rolling_std(CONFIG["year_trading_days"]) - .alias("index_vol_yoy"), - pl.col("log_return") - .rolling_std(CONFIG["2year_trading_days"]) - .alias("index_vol_2y"), - ) - - index_df = index_df.rename( - { - "date": "index_date", - "close": "index_close", - "adj_close": "index_adj_close", - } - ) - - logger.success(f"S&P500 index data {index_df.shape[0]} rows PROCESSED") - - return index_df.select( - [ - "index_date", - "index_close", - "index_adj_close", - "index_mom", - "index_qoq", - "index_sos", - "index_yoy", - "index_2y", - "index_vol_mom", - "index_vol_qoq", - "index_vol_sos", - "index_vol_yoy", - "index_vol_2y", - ] - ) - - def feature_engineering(self) -> pl.DataFrame: - """ - Compute financial ratios and features for training. - """ + # feature engineering logger.info("START feature engineering") - - # fetch data - df = self.db.fetch_financial_data() - info = self.db.fetch_stock() - market_df = self.db.fetch_market_data().sort(["tic", "date"]) - insider_df = self.db.fetch_insider_data() - - # compute all features df = compute_trade_date(df) df = adjust_shares(df) df = compute_insider_trading_features(df, insider_df) - df = compute_financial_ratios(df) - df = compute_market_ratios(df, market_df, self.index_data) - df = compute_growth_ratios(df) + df = compute_financial_features(df) + df = compute_sp500_features(df, index_data) + df = compute_vix_features(df, vix_data) + df = compute_market_features(df, market_df, index_data) + df = compute_growth_features(df) + df = compute_piotroski_score(df) df = compute_performance_targets(df) df = df.join(info.select(["tic", "sector"]), on="tic", how="left") + df = clean_data(df) - logger.success(f"{df.shape[1]} features PROCESSED") - return df - - def clean_data(self, data: pl.DataFrame) -> pl.DataFrame: - """ - Clean and process financial features dataset. - - Parameters - ---------- - data : pl.DataFrame - Financial features dataset. - - Returns - ------- - pl.DataFrame - Filtered and processed data. 
- """ - - logger.info("START cleaning data") - - df = data.filter(pl.col("tdq") <= pl.lit(dt.datetime.today().date())) - growth_alias = ["qoq", "yoy", "2y", "return"] - growth_vars = [f for f in df.columns if any(xf in f for xf in growth_alias)] - df = df.filter(~pl.all_horizontal(pl.col("niq_2y").is_null())) - df = df.filter( - pl.col("sector").is_in( - [ - "Health Care", - "Financials", - "Industrials", - "Consumer Discretionary", - "Information Technology", - "Communication Services", - "Consumer Staples", - "Utilities", - "Real Estate", - "Materials", - "Energy", - ] - ) - ) - - for feature in [f for f in df.columns if any(xf in f for xf in growth_vars)]: - df = df.with_columns(df.with_columns(pl.col(feature).clip(-30, 30))) + logger.success(f"END {df.shape[1]} features PROCESSED") + except Exception as e: + logger.error(f"FAILED processing stock data: {e}") + raise e + return df - float_cols = df.select(pl.col(pl.Float64)).columns - df = df.with_columns( - [ - pl.col(col) - .replace(float("inf"), float("nan")) - .replace(float("-inf"), float("nan")) - .alias(col) - for col in float_cols - ] - ) - df = df.with_columns([pl.col("freturn") * 100, pl.col("adj_freturn") * 100]) - df = df.to_dummies(columns=["sector"]) +def compute_trade_date(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute trade date intervals, to be used as a proxy for quarters. + These represent the financial observations in which models will be trained on. + """ - logger.success(f"{df.shape[0]} rows retained after CLEANING") - return df + min_year = df["rdq"].dt.year().min() + max_year = df["rdq"].dt.year().max() - def save_data(self, data: pl.DataFrame) -> None: - """ - Saves processed data locally. + quarter_dates = generate_quarter_dates(min_year, max_year) + quarter_df = pl.DataFrame({"tdq": quarter_dates}).with_columns(pl.col("tdq").dt.date()) - Parameters - ---------- - data : pl.DataFrame - Filtered and processed data. - """ - today = dt.datetime.today().date() - file_name = f"data/processed/proc_{today}.csv" - data.write_csv(file_name) + df = df.sort(by=["rdq", "tic"]) + df = df.join_asof(quarter_df, left_on="rdq", right_on="tdq", strategy="forward") + return df.sort(by=["tic", "rdq"]) def generate_quarter_dates(start_year: int, end_year: int) -> list: @@ -228,28 +86,8 @@ def generate_quarter_dates(start_year: int, end_year: int) -> list: return quarter_end_dates -def compute_trade_date(df: pl.DataFrame) -> pl.DataFrame: - """ - Compute trade dates. 
- """ - - min_year = df["rdq"].dt.year().min() - max_year = df["rdq"].dt.year().max() - - quarter_dates = generate_quarter_dates(min_year, max_year) - quarter_df = pl.DataFrame({"tdq": quarter_dates}).with_columns( - pl.col("tdq").dt.date() - ) - - df = df.sort(by=["rdq", "tic"]) - df = df.join_asof(quarter_df, left_on="rdq", right_on="tdq", strategy="forward") - return df.sort(by=["tic", "rdq"]) - - def map_to_closest_split_factor(approx_factor: float) -> float: - common_split_ratios = np.array( - [1, 0.5, 0.33, 0.25, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30] - ) + common_split_ratios = np.array([1, 0.5, 0.33, 0.25, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30]) diffs = np.abs(common_split_ratios - approx_factor) closest_index = np.argmin(diffs) return common_split_ratios[closest_index] @@ -275,9 +113,7 @@ def adjust_shares(df: pl.DataFrame) -> pl.DataFrame: (pl.col("cshoq") / pl.col("cshoq").shift()).over("tic").alias("csho_ratio") ) df = df.with_columns( - ((pl.col("cshoq") / pl.col("cshoq").shift() - 1) > 0.25) - .over("tic") - .alias("stock_split") + ((pl.col("cshoq") / pl.col("cshoq").shift() - 1) > 0.25).over("tic").alias("stock_split") ) df = df.with_columns(pl.col("stock_split").fill_null(False)) df = df.with_columns( @@ -305,26 +141,18 @@ def adjust_shares(df: pl.DataFrame) -> pl.DataFrame: ) # apply the cumulative adjustment to the financial data - df = df.with_columns( - (pl.col("cshoq") * pl.col("cum_adjustment_factor")).alias("cshoq") - ) + df = df.with_columns((pl.col("cshoq") * pl.col("cum_adjustment_factor")).alias("cshoq")) df = df.sort(by=["tic", "tdq"]) - return df.drop( - ["csho_ratio", "split_factor", "adjustment_factor", "cum_adjustment_factor"] - ) + return df.drop(["csho_ratio", "split_factor", "adjustment_factor", "cum_adjustment_factor"]) -def compute_insider_purchases( - df: pl.DataFrame, insider_df: pl.DataFrame -) -> pl.DataFrame: +def compute_insider_purchases(df: pl.DataFrame, insider_df: pl.DataFrame) -> pl.DataFrame: """ Computes insider purchases quarterly features. """ - insider_filtered_lazy = insider_df.lazy().filter( - (pl.col("transaction_type") == "P - Purchase") - ) + insider_filtered_lazy = insider_df.lazy().filter((pl.col("transaction_type") == "P - Purchase")) df_lazy = df.lazy() df_filtered_lazy = df_lazy.join( @@ -358,8 +186,7 @@ def compute_insider_sales(df: pl.DataFrame, insider_df: pl.DataFrame) -> pl.Data """ insider_filtered_lazy = insider_df.lazy().filter( - (pl.col("transaction_type") == "S - Sale") - | (pl.col("transaction_type") == "S - Sale+OE") + (pl.col("transaction_type") == "S - Sale") | (pl.col("transaction_type") == "S - Sale+OE") ) df_lazy = df.lazy() @@ -390,9 +217,7 @@ def compute_insider_sales(df: pl.DataFrame, insider_df: pl.DataFrame) -> pl.Data return result.collect() -def compute_insider_trading_features( - df: pl.DataFrame, insider_df: pl.DataFrame -) -> pl.DataFrame: +def compute_insider_trading_features(df: pl.DataFrame, insider_df: pl.DataFrame) -> pl.DataFrame: """ Compute insider trading features, i.e. insider sales, purchases and balances. 
@@ -412,13 +237,11 @@ def compute_insider_trading_features( df = compute_insider_purchases(df, insider_df) df = compute_insider_sales(df, insider_df) - df = df.with_columns( - (pl.col("val_sales") - pl.col("val_purch")).alias("insider_balance") - ) + df = df.with_columns((pl.col("val_sales") - pl.col("val_purch")).alias("insider_balance")) return df -def compute_financial_ratios(df: pl.DataFrame) -> pl.DataFrame: +def compute_financial_features(df: pl.DataFrame) -> pl.DataFrame: """ Computes a selected number of financial ratios. @@ -435,9 +258,7 @@ def compute_financial_ratios(df: pl.DataFrame) -> pl.DataFrame: return ( df.lazy() .with_columns( - (pl.col("niq").rolling_sum(4) / pl.col("atq").rolling_mean(2)) - .over("tic") - .alias("roa"), + (pl.col("niq").rolling_sum(4) / pl.col("atq")).over("tic").alias("roa"), (pl.col("niq").rolling_sum(4) / pl.col("seqq")).over("tic").alias("roe"), ((pl.col("saleq") - pl.col("cogsq")) / pl.col("saleq")).alias("gpm"), (pl.col("ebitdaq") / pl.col("saleq")).alias("ebitdam"), @@ -451,12 +272,8 @@ def compute_financial_ratios(df: pl.DataFrame) -> pl.DataFrame: (pl.col("ltq") / pl.col("ebitdaq")).alias("debitda"), (pl.col("dlttq") / pl.col("atq")).alias("ltda"), ((pl.col("oancfq") - pl.col("capxq")) / pl.col("dlttq")).alias("ltcr"), - (pl.col("saleq") / pl.col("invtq").rolling_mean(2)) - .over("tic") - .alias("itr"), - (pl.col("saleq") / pl.col("rectq").rolling_mean(2)) - .over("tic") - .alias("rtr"), + (pl.col("saleq") / pl.col("invtq").rolling_mean(2)).over("tic").alias("itr"), + (pl.col("saleq") / pl.col("rectq").rolling_mean(2)).over("tic").alias("rtr"), (pl.col("saleq") / pl.col("atq").rolling_mean(2)).over("tic").alias("atr"), pl.col("atq").log().alias("size"), ) @@ -464,7 +281,119 @@ def compute_financial_ratios(df: pl.DataFrame) -> pl.DataFrame: ) -def compute_market_ratios( +def compute_sp500_features(df: pl.DataFrame, index_df: pl.DataFrame) -> pl.DataFrame: + """ + Process S&P500 index price data. + + Returns + ------- + pl.DataFrame + Processed data, incl. forward and past return rates. 
+ """ + + index_df = index_df.sort(by=["date"]) + df = df.sort(by=["tdq", "tic"]) + + # compute index past returns + index_df = index_df.with_columns( + pl.col("close").pct_change(config.processing.trading_days_month).alias("index_mom"), + pl.col("close").pct_change(config.processing.trading_days_quarter).alias("index_qoq"), + pl.col("close").pct_change(config.processing.trading_days_semester).alias("index_sos"), + pl.col("close").pct_change(config.processing.trading_days_year).alias("index_yoy"), + pl.col("close").pct_change(config.processing.trading_days_2year).alias("index_2y"), + ) + + # compute volatility of index + index_df = index_df.with_columns( + (pl.col("close") / pl.col("close").shift(1)).log().alias("log_return") + ).with_columns( + pl.col("log_return") + .rolling_std(config.processing.trading_days_month) + .alias("index_vol_mom"), + pl.col("log_return") + .rolling_std(config.processing.trading_days_quarter) + .alias("index_vol_qoq"), + pl.col("log_return") + .rolling_std(config.processing.trading_days_semester) + .alias("index_vol_sos"), + pl.col("log_return") + .rolling_std(config.processing.trading_days_year) + .alias("index_vol_yoy"), + pl.col("log_return") + .rolling_std(config.processing.trading_days_2year) + .alias("index_vol_2y"), + ) + + index_df = index_df.rename( + { + "date": "index_date", + "close": "index_close", + } + ) + + index_df = index_df.select( + [ + "index_date", + "index_close", + "index_mom", + "index_qoq", + "index_sos", + "index_yoy", + "index_2y", + "index_vol_mom", + "index_vol_qoq", + "index_vol_sos", + "index_vol_yoy", + "index_vol_2y", + ] + ) + df = df.join_asof( + index_df, + left_on="tdq", + right_on="index_date", + strategy="backward", + tolerance=dt.timedelta(days=7), + ) + return df.sort(by=["tic", "tdq"]) + + +def compute_vix_features(df: pl.DataFrame, vix_df: pl.DataFrame) -> pl.DataFrame: + """ + Process VIX data. + """ + logger.info("START processing VIX data") + + vix_df = vix_df.sort(by=["date"]) + df = df.sort(by=["tdq", "tic"]) + + vix_df = vix_df.with_columns( + pl.col("close").alias("market_fear"), + pl.col("close").rolling_mean(30).alias("fear_ma30"), + (pl.col("close") > pl.col("close").rolling_max(252) * 0.8).cast(pl.Int8).alias("high_fear"), + (pl.col("close") < pl.col("close").rolling_min(252) * 1.2).cast(pl.Int8).alias("low_fear"), + ) + + vix_df = vix_df.rename({"date": "vix_date"}) + vix_df = vix_df.select( + [ + "vix_date", + "market_fear", + "fear_ma30", + "high_fear", + "low_fear", + ] + ) + df = df.join_asof( + vix_df, + left_on="tdq", + right_on="vix_date", + strategy="backward", + tolerance=dt.timedelta(days=7), + ) + return df.sort(by=["tic", "tdq"]) + + +def compute_market_features( df: pl.DataFrame, market_df: pl.DataFrame, index_df: pl.DataFrame ) -> pl.DataFrame: """ @@ -485,34 +414,25 @@ Main dataset with added ratios.
""" + market_df = compute_volume_features(market_df) market_df = compute_daily_momentum_features(market_df) market_df = compute_daily_volatility_features(market_df) df = df.sort(by=["rdq", "tic"]) - df = ( - df.join_asof( - market_df.drop(["volume"]), - left_on="tdq", - right_on="date", - by="tic", - strategy="backward", - tolerance=dt.timedelta(days=7), - ) - .join_asof( - market_df.select(["date", "tic", "close"]).rename({"close": "rdq_close"}), - left_on="rdq", - right_on="date", - by="tic", - strategy="forward", - tolerance=dt.timedelta(days=7), - ) - .join_asof( - index_df, - left_on="tdq", - right_on="index_date", - strategy="backward", - tolerance=dt.timedelta(days=7), - ) + df = df.join_asof( + market_df.drop(["volume"]), + left_on="tdq", + right_on="date", + by="tic", + strategy="backward", + tolerance=dt.timedelta(days=7), + ).join_asof( + market_df.select(["date", "tic", "close"]).rename({"close": "rdq_close"}), + left_on="rdq", + right_on="date", + by="tic", + strategy="forward", + tolerance=dt.timedelta(days=7), ) df = df.sort(by=["tic", "tdq"]) @@ -520,6 +440,16 @@ def compute_market_ratios( return df +def compute_volume_features(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute volume features. + """ + return df.with_columns( + pl.col("volume").rolling_mean(20).over("tic").alias("volume_ma20"), + pl.col("volume").rolling_mean(50).over("tic").alias("volume_ma50"), + ) + + def compute_daily_momentum_features(df: pl.DataFrame) -> pl.DataFrame: """ Compute daily price momentum features. @@ -535,29 +465,27 @@ def compute_daily_momentum_features(df: pl.DataFrame) -> pl.DataFrame: Market data with momementum features. """ return df.with_columns( - [ - plta.rsi(pl.col("close"), timeperiod=14).over("tic").alias("rsi_14d"), - plta.rsi(pl.col("close"), timeperiod=30).over("tic").alias("rsi_30d"), - plta.rsi(pl.col("close"), timeperiod=60).over("tic").alias("rsi_60d"), - plta.rsi(pl.col("close"), timeperiod=90).over("tic").alias("rsi_90d"), - plta.rsi(pl.col("close"), timeperiod=360).over("tic").alias("rsi_1y"), - pl.col("close") - .pct_change(CONFIG["month_trading_days"]) - .over("tic") - .alias("price_mom"), - pl.col("close") - .pct_change(CONFIG["quarter_trading_days"]) - .over("tic") - .alias("price_qoq"), - pl.col("close") - .pct_change(CONFIG["year_trading_days"]) - .over("tic") - .alias("price_yoy"), - pl.col("close") - .pct_change(CONFIG["2year_trading_days"]) - .over("tic") - .alias("price_2y"), - ] + plta.rsi(pl.col("close"), timeperiod=14).over("tic").alias("rsi_14d"), + plta.rsi(pl.col("close"), timeperiod=30).over("tic").alias("rsi_30d"), + plta.rsi(pl.col("close"), timeperiod=60).over("tic").alias("rsi_60d"), + plta.rsi(pl.col("close"), timeperiod=90).over("tic").alias("rsi_90d"), + plta.rsi(pl.col("close"), timeperiod=360).over("tic").alias("rsi_1y"), + pl.col("close") + .pct_change(config.processing.trading_days_month) + .over("tic") + .alias("price_mom"), + pl.col("close") + .pct_change(config.processing.trading_days_quarter) + .over("tic") + .alias("price_qoq"), + pl.col("close") + .pct_change(config.processing.trading_days_year) + .over("tic") + .alias("price_yoy"), + pl.col("close") + .pct_change(config.processing.trading_days_2year) + .over("tic") + .alias("price_2y"), ) @@ -576,22 +504,19 @@ def compute_daily_volatility_features(df: pl.DataFrame) -> pl.DataFrame: Market data with momementum features. 
""" df = df.with_columns( - (pl.col("close") / pl.col("close").shift(1)) - .log() - .over("tic") - .alias("log_return") + (pl.col("close") / pl.col("close").shift(1)).log().over("tic").alias("log_return") ) return df.with_columns( pl.col("log_return") - .rolling_std(CONFIG["month_trading_days"]) + .rolling_std(config.processing.trading_days_month) .over("tic") .alias("vol_mom"), pl.col("log_return") - .rolling_std(CONFIG["quarter_trading_days"]) + .rolling_std(config.processing.trading_days_quarter) .over("tic") .alias("vol_qoq"), pl.col("log_return") - .rolling_std(CONFIG["year_trading_days"]) + .rolling_std(config.processing.trading_days_year) .over("tic") .alias("vol_yoy"), ) @@ -632,98 +557,12 @@ def compute_hybrid_features(df: pl.DataFrame) -> pl.DataFrame: .with_columns( (pl.col("mkt_cap") + pl.col("ltq") - pl.col("cheq")).alias("ev"), (pl.col("mkt_cap") / (pl.col("atq") - pl.col("ltq"))).alias("pb"), - (pl.col("mkt_cap") / pl.col("saleq").rolling_sum(4)) - .over("tic") - .alias("ps"), + (pl.col("mkt_cap") / pl.col("saleq").rolling_sum(4)).over("tic").alias("ps"), ) .with_columns( - (pl.col("ev") / pl.col("ebitdaq").rolling_sum(4)) - .over("tic") - .alias("ev_ebitda") - ) - ) - - -def compute_performance_targets(df: pl.DataFrame) -> pl.DataFrame: - """ - Compute target forward performance ratios. - - Parameters - ---------- - df : pl.DataFrame - Main dataset. - - Returns - ------- - pl.DataFrame - Dataset with each observation associated to forward returns and flags. - """ - - df = df.with_columns( - ( - ( - pl.col("index_adj_close").shift(-CONFIG["prediction_horizon"]) - / pl.col("index_adj_close") - ) - - 1 - ) - .over("tic") - .alias("index_freturn"), - ( - ( - pl.col("adj_close").shift(-CONFIG["prediction_horizon"]) - / pl.col("adj_close") - ) - - 1 - ) - .over("tic") - .alias("freturn"), - ((pl.col("adj_close").shift(-1) / pl.col("adj_close")) - 1) - .over("tic") - .alias("freturn_1q"), - ((pl.col("adj_close").shift(-2) / pl.col("adj_close")) - 1) - .over("tic") - .alias("freturn_2q"), - ((pl.col("adj_close").shift(-3) / pl.col("adj_close")) - 1) - .over("tic") - .alias("freturn_3q"), - ((pl.col("adj_close").shift(-4) / pl.col("adj_close")) - 1) - .over("tic") - .alias("freturn_4q"), - ) - df = df.with_columns( - (pl.col("freturn") - pl.col("index_freturn")).alias("adj_freturn") - ) - df = df.with_columns( - (pl.col("adj_freturn") > CONFIG["over_performance_threshold"]) - .cast(pl.Int8) - .alias("adj_fperf"), - (pl.col("freturn_1q") > CONFIG["performance_threshold"]) - .cast(pl.Int8) - .alias("fperf_1q"), - (pl.col("freturn_2q") > CONFIG["performance_threshold"]) - .cast(pl.Int8) - .alias("fperf_2q"), - (pl.col("freturn_3q") > CONFIG["performance_threshold"]) - .cast(pl.Int8) - .alias("fperf_3q"), - (pl.col("freturn_4q") > CONFIG["performance_threshold"]) - .cast(pl.Int8) - .alias("fperf_4q"), - ).with_columns( - ( - ( - pl.col("fperf_1q") - + pl.col("fperf_2q") - + pl.col("fperf_3q") - + pl.col("fperf_4q") - ) - > 0 + (pl.col("ev") / pl.col("ebitdaq").rolling_sum(4)).over("tic").alias("ev_ebitda") ) - .cast(pl.Int8) - .alias("fperf") ) - return df def _compute_growth_rate(col: str, periods: int, suffix: str) -> pl.Expr: @@ -780,7 +619,7 @@ def _compute_signed_growth_rate(col: str, periods: int, suffix: str) -> pl.Expr: ) -def compute_growth_ratios(df: pl.DataFrame) -> pl.DataFrame: +def compute_growth_features(df: pl.DataFrame) -> pl.DataFrame: """ Compute rolling growth statistics for financial metrics. 
@@ -840,3 +679,169 @@ def compute_growth_ratios(df: pl.DataFrame) -> pl.DataFrame: expressions.append(_compute_growth_rate(metric, period, suffix)) return df.lazy().with_columns(expressions).collect() + + +def compute_piotroski_score(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute Piotroski F-Score components (9 points total): + 1. ROA > 0 (1 point) + 2. Operating Cash Flow > 0 (1 point) + 3. ROA increasing (1 point) + 4. OCF > ROA (1 point) + 5. Decrease in leverage (LTD) (1 point) + 6. Increase in current ratio (1 point) + 7. No new shares issued (1 point) + 8. Increase in gross margin (1 point) + 9. Increase in asset turnover (1 point) + """ + df = df.with_columns( + [ + # Profitability Signals (4 points) + (pl.col("roa") > 0).cast(pl.Int8).alias("f_roa"), + (pl.col("oancfq") > 0).cast(pl.Int8).alias("f_ocf"), + (pl.col("roa") > pl.col("roa").shift(4)).over("tic").cast(pl.Int8).alias("f_droa"), + (pl.col("oancfq").rolling_sum(4) / pl.col("atq") > pl.col("roa")) + .over("tic") + .cast(pl.Int8) + .alias("f_accrual"), + # Leverage, Liquidity, and Source of Funds (3 points) + (pl.col("ltq") < pl.col("ltq").shift(4)).over("tic").cast(pl.Int8).alias("f_dlever"), + (pl.col("cr") > pl.col("cr").shift(4)).over("tic").cast(pl.Int8).alias("f_dliquid"), + (pl.col("cshoq") <= pl.col("cshoq").shift(4)).over("tic").cast(pl.Int8).alias("f_dshr"), + # Operating Efficiency (2 points) + (pl.col("gpm") > pl.col("gpm").shift(4)).over("tic").cast(pl.Int8).alias("f_dgm"), + (pl.col("atr") > pl.col("atr").shift(4)).over("tic").cast(pl.Int8).alias("f_dturn"), + ] + ).with_columns( + [ + # Total F-Score (sum of all components) + ( + pl.col("f_roa") + + pl.col("f_ocf") + + pl.col("f_droa") + + pl.col("f_accrual") + + pl.col("f_dlever") + + pl.col("f_dliquid") + + pl.col("f_dshr") + + pl.col("f_dgm") + + pl.col("f_dturn") + ).alias("f_score") + ] + ) + component_cols = [ + "f_roa", + "f_ocf", + "f_droa", + "f_accrual", + "f_dlever", + "f_dliquid", + "f_dshr", + "f_dgm", + "f_dturn", + ] + return df.drop(component_cols) + + +def compute_performance_targets(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute target forward performance ratios. + + Parameters + ---------- + df : pl.DataFrame + Main dataset. + + Returns + ------- + pl.DataFrame + Dataset with each observation associated to forward returns and flags. 
+ """ + + df = df.with_columns( + ( + ( + pl.col("index_close").shift(-config.processing.prediction_horizon) + / pl.col("index_close") + ) + - 1 + ) + .over("tic") + .alias("index_freturn"), + ( + (pl.col("adj_close").shift(-config.processing.prediction_horizon) / pl.col("adj_close")) + - 1 + ) + .over("tic") + .alias("freturn"), + ((pl.col("adj_close").shift(-1) / pl.col("adj_close")) - 1).over("tic").alias("freturn_1q"), + ((pl.col("adj_close").shift(-2) / pl.col("adj_close")) - 1).over("tic").alias("freturn_2q"), + ((pl.col("adj_close").shift(-3) / pl.col("adj_close")) - 1).over("tic").alias("freturn_3q"), + ((pl.col("adj_close").shift(-4) / pl.col("adj_close")) - 1).over("tic").alias("freturn_4q"), + ) + df = df.with_columns((pl.col("freturn") - pl.col("index_freturn")).alias("adj_freturn")) + df = df.with_columns( + (pl.col("adj_freturn") > config.processing.over_performance_threshold) + .cast(pl.Int8) + .alias("adj_fperf"), + (pl.col("freturn_1q") > config.processing.performance_threshold) + .cast(pl.Int8) + .alias("fperf_1q"), + (pl.col("freturn_2q") > config.processing.performance_threshold) + .cast(pl.Int8) + .alias("fperf_2q"), + (pl.col("freturn_3q") > config.processing.performance_threshold) + .cast(pl.Int8) + .alias("fperf_3q"), + (pl.col("freturn_4q") > config.processing.performance_threshold) + .cast(pl.Int8) + .alias("fperf_4q"), + ).with_columns( + ((pl.col("fperf_1q") + pl.col("fperf_2q") + pl.col("fperf_3q") + pl.col("fperf_4q")) > 0) + .cast(pl.Int8) + .alias("fperf") + ) + return df + + +def clean_data(data: pl.DataFrame) -> pl.DataFrame: + """ + Clean and process financial features dataset. + + Parameters + ---------- + data : pl.DataFrame + Financial features dataset. + + Returns + ------- + pl.DataFrame + Filtered and processed data. + """ + + logger.info("START cleaning data") + + df = data.filter(pl.col("tdq") <= pl.lit(dt.datetime.today().date())) + growth_alias = ["qoq", "yoy", "2y", "return"] + growth_vars = [f for f in df.columns if any(xf in f for xf in growth_alias)] + df = df.filter(~pl.all_horizontal(pl.col("niq_2y").is_null())) + df = df.filter(pl.col("sector").is_in(config.processing.sectors)) + + for feature in [f for f in df.columns if any(xf in f for xf in growth_vars)]: + df = df.with_columns(df.with_columns(pl.col(feature).clip(-30, 30))) + + float_cols = df.select(pl.col(pl.Float64)).columns + df = df.with_columns( + [ + pl.col(col) + .replace(float("inf"), float("nan")) + .replace(float("-inf"), float("nan")) + .alias(col) + for col in float_cols + ] + ) + + df = df.with_columns([pl.col("freturn") * 100, pl.col("adj_freturn") * 100]) + df = df.to_dummies(columns=["sector"]) + + logger.success(f"{df.shape[0]} rows retained after CLEANING") + return df diff --git a/stocksense/pipeline/scraper.py b/stocksense/pipeline/scraper.py index 1db5bf8..504beb0 100644 --- a/stocksense/pipeline/scraper.py +++ b/stocksense/pipeline/scraper.py @@ -5,7 +5,7 @@ import requests import yfinance as yf from bs4 import BeautifulSoup as bs -from config import get_config +from config import config from pyrate_limiter import Duration, Limiter, RequestRate from requests import Session from requests_cache import CacheMixin, SQLiteCache @@ -57,9 +57,7 @@ def _get_market_data_yfinance(self, start_date): Scrape daily market data for a stock, until present. 
""" df = pl.from_pandas( - self.handler.history(start=start_date, auto_adjust=False).reset_index( - drop=False - ) + self.handler.history(start=start_date, auto_adjust=False).reset_index(drop=False) ) if df.is_empty(): @@ -89,7 +87,7 @@ def _get_stock_info_yfinance(self) -> dict: if not data: raise Exception("No status information data available.") - fields = get_config("scraping")["yahoo_info"] + fields = config.scraping.yahoo_info record = dict.fromkeys(list(fields.values()), None) record["tic"] = self.tic @@ -111,7 +109,7 @@ def _get_fundamental_data_yfinance( :raises Exception: no financial records are available. :return pl.DataFrame: financial report data from yfinance. """ - fields_to_keep = get_config("scraping")["yahoo"] + fields_to_keep = config.scraping.yahoo # retrieve 3 main financial documents is_df = pl.from_pandas(self.handler.quarterly_income_stmt.T.reset_index()) @@ -119,21 +117,13 @@ def _get_fundamental_data_yfinance( cf_df = pl.from_pandas(self.handler.quarterly_cashflow.T.reset_index()) # parse dates - is_df = is_df.with_columns(pl.col("index").dt.date().alias("index")).sort( - "index" - ) - bs_df = bs_df.with_columns(pl.col("index").dt.date().alias("index")).sort( - "index" - ) - cf_df = cf_df.with_columns(pl.col("index").dt.date().alias("index")).sort( - "index" - ) + is_df = is_df.with_columns(pl.col("index").dt.date().alias("index")).sort("index") + bs_df = bs_df.with_columns(pl.col("index").dt.date().alias("index")).sort("index") + cf_df = cf_df.with_columns(pl.col("index").dt.date().alias("index")).sort("index") df = is_df.join_asof( bs_df, on="index", strategy="backward", tolerance=dt.timedelta(days=30) - ).join_asof( - cf_df, on="index", strategy="backward", tolerance=dt.timedelta(days=30) - ) + ).join_asof(cf_df, on="index", strategy="backward", tolerance=dt.timedelta(days=30)) for c in list(fields_to_keep.keys()): if c not in df.columns: @@ -145,9 +135,7 @@ def _get_fundamental_data_yfinance( df = df.select(list(fields_to_keep.keys())) df = df.rename(fields_to_keep) - df = df.filter( - (pl.col("datadate") > start_date) & (pl.col("datadate") <= end_date) - ) + df = df.filter((pl.col("datadate") > start_date) & (pl.col("datadate") <= end_date)) if df.is_empty(): raise Exception("No financial data available for date interval.") @@ -156,9 +144,7 @@ def _get_fundamental_data_yfinance( df = df.with_columns(pl.col(c).cast(pl.Float64)) df = df.with_columns((pl.col(c) / 1000000).round(3).alias(c)) - df = df.with_columns( - [(-pl.col("dvq")).alias("dvq"), (-pl.col("capxq")).alias("capxq")] - ) + df = df.with_columns([(-pl.col("dvq")).alias("dvq"), (-pl.col("capxq")).alias("capxq")]) df = df.unique(subset=["datadate"]).sort("datadate") df = df.with_columns(pl.lit(self.tic).alias("tic")) return df @@ -169,9 +155,7 @@ def _get_earnings_dates_yfinance(self, start_date: dt.date, end_date: dt.date): """ n_quarters = int((end_date - start_date).days / 90) + 20 - df = pl.from_pandas( - self.handler.get_earnings_dates(limit=n_quarters).reset_index() - ) + df = pl.from_pandas(self.handler.get_earnings_dates(limit=n_quarters).reset_index()) df = df.rename({"Earnings Date": "rdq", "Surprise(%)": "surprise_pct"}) @@ -179,11 +163,7 @@ def _get_earnings_dates_yfinance(self, start_date: dt.date, end_date: dt.date): df = df.select(["rdq", "surprise_pct"]) df = df.with_columns(pl.col("rdq").dt.date()) df = df.filter((pl.col("rdq") >= start_date) & (pl.col("rdq") <= end_date)) - df = ( - df.unique(subset=["rdq"]) - .sort("rdq") - .drop_nulls(subset=["surprise_pct", "rdq"]) - ) + df = 
df.unique(subset=["rdq"]).sort("rdq").drop_nulls(subset=["surprise_pct", "rdq"]) if df.is_empty(): raise Exception("No financial release date available for date interval.") @@ -302,9 +282,7 @@ def get_stock_insider_data(self) -> pl.DataFrame: df = df.with_columns( [ - pl.col("filling_date") - .str.to_datetime("%Y-%m-%d %H:%M:%S") - .dt.date(), + pl.col("filling_date").str.to_datetime("%Y-%m-%d %H:%M:%S").dt.date(), pl.col("trade_date").str.to_date("%Y-%m-%d"), ] ) @@ -367,9 +345,7 @@ def get_exchange_stocks(exc_lis): ) data = r.json()["data"] df = pl.DataFrame(data["rows"]) - df = df.filter(pl.col("marketCap") != "0.00").select( - ["symbol", "name", "sector"] - ) + df = df.filter(pl.col("marketCap") != "0.00").select(["symbol", "name", "sector"]) df = df.filter(~pl.col("symbol").str.contains(r"\.|\^")) stock_list.append(df) return pl.concat(stock_list).unique(subset=["symbol"]) diff --git a/tests/test_demo.py b/tests/test_demo.py index 0f39d62..a93b6b1 100644 --- a/tests/test_demo.py +++ b/tests/test_demo.py @@ -1,8 +1,8 @@ -from config import get_config +from config import config def test_config(): - values = get_config("scraping")["base_date"] + values = config.scraping.base_date assert values == "2005-01-01"