Skip to content

Commit

Permalink
Refactor config file structure and processing pipeline. Lint changes
Browse files: browse the repository at this point in the history
  • Loading branch information
Francisco Silva committed Nov 15, 2024
1 parent feee80f commit 97e2285
Show file tree
Hide file tree
Showing 27 changed files with 694 additions and 607 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ The stock classifier is trained using financial ratios and growth features deriv
- **Model Training**: A classifier using GA-XGBoost with features including growth ratios, financial metrics, price momentum, and volatility.
- **Streamlit App**: A web-based interface for exploring stock metrics, visualizing growth ratios, and viewing model predictions.
- **SQLite Database**: Locally stored market, financials, insider trading and status data for historical and current S&P500 members.
- **Pyproject-based Installation**: Easy setup using `pyproject.toml` for dependency management.

## Installation

Expand Down
43 changes: 14 additions & 29 deletions notebooks/modeling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,19 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import datetime as dt\n",
"\n",
"import plotly.express as px\n",
"import polars as pl\n",
"from config import get_config\n",
"from model import XGBoostModel"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"target_col = \"fperf\"\n",
"date_col = \"tdq\""
"from config import config\n",
"from model import XGBoostModel\n",
"\n",
"target_col = config.model.target\n",
"date_col = config.model.date_col"
]
},
{
Expand Down Expand Up @@ -96,9 +89,7 @@
}
],
"source": [
"data = pl.read_csv(\n",
" \"../data/1_work_data/processed/proc_2024-11-02.csv\", try_parse_dates=True\n",
")\n",
"data = pl.read_csv(\"../data/1_work_data/processed/proc_2024-11-02.csv\", try_parse_dates=True)\n",
"data.head()"
]
},
Expand All @@ -108,8 +99,8 @@
"metadata": {},
"outputs": [],
"source": [
"features = get_config(\"model\")[\"features\"]\n",
"df = data.select([pl.col(\"fperf\")] + [pl.col(ratio) for ratio in features]).to_pandas()"
"features = config.model.features\n",
"df = data.select([pl.col(target_col)] + [pl.col(ratio) for ratio in features]).to_pandas()"
]
},
{
Expand Down Expand Up @@ -1215,14 +1206,11 @@
],
"source": [
"data = data.filter(\n",
" (pl.col(\"tdq\") < last_trade_date)\n",
" & (~pl.all_horizontal(pl.col(target_col).is_null()))\n",
" (pl.col(\"tdq\") < last_trade_date) & (~pl.all_horizontal(pl.col(target_col).is_null()))\n",
")\n",
"\n",
"# filter cols\n",
"aux_cols = [\"datadate\", \"rdq\", \"sector\"] + [\n",
" t for t in get_config(\"model\")[\"targets\"] if t != target_col\n",
"]\n",
"aux_cols = [\"datadate\", \"rdq\", \"sector\"] + [t for t in config.model.targets if t != target_col]\n",
"data = data.select([c for c in data.columns if c not in aux_cols])\n",
"data.head()"
]
Expand Down Expand Up @@ -1647,8 +1635,7 @@
"\n",
"def get_scale():\n",
" scale = round(\n",
" len(train.filter(pl.col(target_col) == 0))\n",
" / len(train.filter(pl.col(target_col) == 1))\n",
" len(train.filter(pl.col(target_col) == 0)) / len(train.filter(pl.col(target_col) == 1))\n",
" )\n",
"\n",
" print(f\"Scale of training data (pos/neg): {scale}\")\n",
Expand Down Expand Up @@ -1725,7 +1712,7 @@
" \"scale_pos_weight\": scale,\n",
" \"eval_metric\": \"logloss\",\n",
" \"nthread\": -1,\n",
" \"seed\": get_config(\"model\")[\"seed\"],\n",
" \"seed\": config.model.seed,\n",
"}\n",
"\n",
"rbf_model = XGBClassifier(**params)\n",
Expand Down Expand Up @@ -2106,9 +2093,7 @@
"\n",
"feature_names = [\n",
" a + \": \" + str(b)\n",
" for a, b in zip(\n",
" X_train.columns, np.abs(shap_values.values).mean(0).round(2), strict=False\n",
" )\n",
" for a, b in zip(X_train.columns, np.abs(shap_values.values).mean(0).round(2), strict=False)\n",
"]\n",
"\n",
"shap.summary_plot(\n",
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ dependencies = [
"watchdog",
"ipykernel",
"shap",
"pre-commit"
"pre-commit",
"pydantic"
]

[project.optional-dependencies]
Expand All @@ -52,7 +53,7 @@ fail_under = 80
where = ["stocksense"]

[tool.ruff]
line-length = 88
line-length = 100
target-version = "py310"

[tool.ruff.lint]
Expand Down
35 changes: 9 additions & 26 deletions stocksense/app/pages/analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ def load_processed_data():
csv_files = directory_path.glob("*.csv")

date_files = [
(file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d"))
for file in csv_files
(file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")) for file in csv_files
]
if date_files:
most_recent_file = max(date_files, key=lambda x: x[1])[0]
Expand All @@ -76,14 +75,11 @@ def display_stock_info(stock, info):
st.markdown(f"**Sector**: {stock.loc[0, 'sector']}")
st.markdown(f"**Last price**: {(info.loc[0, 'curr_price']):.2f} $")
st.markdown(f"**Market Cap**: {(info.loc[0, 'market_cap'] / MILLION):.2f} M$")
st.markdown(
f"**Out. Shares**: {(info.loc[0, 'shares_outstanding'] / MILLION):.2f} M"
)
st.markdown(f"**Out. Shares**: {(info.loc[0, 'shares_outstanding'] / MILLION):.2f} M")
st.markdown(f"**Volume**: {(info.loc[0, 'volume'])} M$")
st.markdown(f"**Beta**: {(info.loc[0, 'beta']):.3f}")
st.markdown(
"**Enterprise Value**: "
f"{(info.loc[0, 'enterprise_value'] / MILLION):.2f} M$"
"**Enterprise Value**: " f"{(info.loc[0, 'enterprise_value'] / MILLION):.2f} M$"
)
st.divider()
st.markdown(f"**Trailing PE**: {(info.loc[0, 'fiftytwo_wc']):.2f}")
Expand Down Expand Up @@ -145,10 +141,7 @@ def plot_market_data(df, index_df):
col=1,
)

colors = [
"#27AE60" if dif >= 0 else "#B03A2E"
for dif in df["close"].diff().values.tolist()
]
colors = ["#27AE60" if dif >= 0 else "#B03A2E" for dif in df["close"].diff().values.tolist()]

fig.add_trace(
go.Bar(x=df["date"], y=df["volume"], showlegend=False, marker_color=colors),
Expand All @@ -172,9 +165,7 @@ def plot_financial_data(df):
"""
col = st.selectbox("Select", df.columns[3:], key="financial")
fig = go.Figure()
fig.add_trace(
go.Bar(x=df["rdq"], y=df[col], name=f"{col}", marker_color="orangered")
)
fig.add_trace(go.Bar(x=df["rdq"], y=df[col], name=f"{col}", marker_color="orangered"))
fig.update_layout(template="plotly_dark")
st.plotly_chart(fig, use_container_width=True, theme=None)

Expand Down Expand Up @@ -224,9 +215,7 @@ def plot_insider_data(df):
Plots scatter plot for insider trading data.
"""

df["value"] = (
df["value"].replace({r"\$": "", ",": ""}, regex=True).astype(float).abs()
)
df["value"] = df["value"].replace({r"\$": "", ",": ""}, regex=True).astype(float).abs()

fig = px.scatter(
df,
Expand All @@ -249,9 +238,7 @@ def plot_processed_data(df):
"""
col = st.selectbox("Select", df.columns[15:], key="proc")
fig = go.Figure()
fig.add_trace(
go.Bar(x=df["tdq"], y=df[col], name=f"{col}", marker_color="orangered")
)
fig.add_trace(go.Bar(x=df["tdq"], y=df[col], name=f"{col}", marker_color="orangered"))
st.plotly_chart(fig, use_container_width=True)


Expand Down Expand Up @@ -311,16 +298,12 @@ def main():
with tab1:
display_stock_info(stock, info)
with tab2:
mdf = market[
(market["date"] >= start_dates[selected_range])
& (market["date"] <= max_date)
]
mdf = market[(market["date"] >= start_dates[selected_range]) & (market["date"] <= max_date)]
idf = sp[(sp["date"] >= start_dates[selected_range]) & (sp["date"] <= max_date)]
plot_market_data(mdf, idf)
with tab3:
fdf = financials.loc[
(financials["rdq"] >= start_dates[selected_range])
& (financials["rdq"] <= max_date)
(financials["rdq"] >= start_dates[selected_range]) & (financials["rdq"] <= max_date)
]
plot_financial_data(fdf)
with tab4:
Expand Down
11 changes: 2 additions & 9 deletions stocksense/app/pages/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,7 @@ def load_sp500_data():

financials = db.fetch_financial_data().to_pandas()
financials["rdq"] = pd.to_datetime(financials["rdq"])
financials = (
financials.sort_values("rdq", ascending=False)
.groupby("tic")
.first()
.reset_index()
)
financials = financials.sort_values("rdq", ascending=False).groupby("tic").first().reset_index()
stock_df = stock_df.merge(financials, how="left", on="tic")
return stock_df

Expand Down Expand Up @@ -72,9 +67,7 @@ def show_recent_earnings(data):
"tic": "Stock",
"rdq": st.column_config.DateColumn("Earnings Date", format="YYYY-MM-DD"),
"sector": "Sector",
"curr_price": st.column_config.NumberColumn(
"Current Price", format="$%.2f"
),
"curr_price": st.column_config.NumberColumn("Current Price", format="$%.2f"),
"saleq": st.column_config.NumberColumn("Sales", format="$%.2f"),
"surprise_pct": st.column_config.NumberColumn("Surprise %", format="$%.2f"),
},
Expand Down
6 changes: 4 additions & 2 deletions stocksense/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from config.definitions import ROOT_PATH, get_config
from config.manager import ConfigManager

__all__ = ["ROOT_PATH", "get_config"]
config = ConfigManager()

__all__ = ["ROOT_PATH", "config"]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
'schema':
'db_schema':
'stock':
- 'tic'
- 'name'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
- 'n_purch'
- 'n_sales'
- 'insider_balance'
# growth, momentum and volatility features
# market momentum and volatility features
- 'volume_ma20'
- 'volume_ma50'
- 'price_mom'
- 'price_qoq'
- 'price_yoy'
Expand All @@ -23,6 +25,9 @@
- 'momentum_qoq'
- 'momentum_yoy'
- 'momentum_2y'
- 'fear_ma30'
- 'high_fear'
- 'low_fear'
# financial features
- 'gpm'
- 'roa'
Expand All @@ -46,6 +51,7 @@
- 'pb'
- 'ps'
- 'ev_ebitda'
- 'f_score'
# growth features
- 'saleq_yoy'
- 'saleq_2y'
Expand Down
21 changes: 21 additions & 0 deletions stocksense/config/defaults/processing_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Default values for the processing configuration.

# Look-back window lengths, expressed in trading days (per the key names).
'two_week_trading_days': 10
'month_trading_days': 21
# NOTE(review): 252 / 4 = 63, yet the quarter window is 61 -- confirm this is intentional.
'quarter_trading_days': 61
'semester_trading_days': 126
'year_trading_days': 252
'two_year_trading_days': 504
# NOTE(review): the unit of the horizon is not stated here (presumably quarters) -- confirm
# against the code that consumes it.
'prediction_horizon': 4
# NOTE(review): presumably fractional return thresholds (0.1 = 10%, 0.4 = 40%) -- confirm
# how each one is applied when building the target columns.
'over_performance_threshold': 0.1
'performance_threshold': 0.4
# Sector labels; these appear to be the GICS sector names -- verify they match the
# spelling used in the stored stock data.
'sectors':
- 'Health Care'
- 'Financials'
- 'Industrials'
- 'Consumer Discretionary'
- 'Information Technology'
- 'Communication Services'
- 'Consumer Staples'
- 'Utilities'
- 'Real Estate'
- 'Materials'
- 'Energy'
File renamed without changes.
16 changes: 0 additions & 16 deletions stocksense/config/definitions.py

This file was deleted.

Loading

0 comments on commit 97e2285

Please sign in to comment.