import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15, 9)
%matplotlib inline
import seaborn as sbn
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
#Misc.
import psycopg2
from typing import Union, Tuple, List, Dict, Any, Optional
from pmdarima.arima import ADFTest
from sklearn.metrics import mean_squared_error
from fastdtw import fastdtw
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
import dtw
#Scipy - Clustering and Statistics
import scipy
import scipy.cluster.hierarchy as sch
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform, euclidean
#NetworkX - Graphs
import networkx as nx
#RESTful APIs and Scraping
import requests
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import bs4 as bs
# NLTK VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import warnings
warnings.filterwarnings("ignore")
#Send a GET request for the Wikipedia page listing S&P500 companies
resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') #Returns the HTML of the Wikipedia page for S&P500 companies
soup = bs.BeautifulSoup(resp.text, 'lxml') #Parse the webpage from Wikipedia with BeautifulSoup
table = soup.find('table', {'class': 'wikitable sortable'}) #Find the contents under the table tag
tickers = [] #Stores tickers
names = [] #Stores company names
industries = [] #Stores GICS Sector
for row in table.findAll('tr')[1:]: #Iterates through each row in the table excluding the header and extracts relevant data
tickers.append(row.findAll('td')[0].text)
names.append(row.findAll('td')[1].text)
industries.append(row.findAll('td')[3].text)
tickers = [s.replace('\n', '') for s in tickers] #delete newline characters in parsed data
stock_info = pd.DataFrame({'Ticker': tickers, 'Name': names, 'Sector': industries}) #Create Dataframe from Wikipedia data
stock_info = stock_info.replace('Communication Services\n','Communication Services') #Strip the stray newline from the Communication Services sector label
stock_info = stock_info.sort_values(by=['Ticker'], ignore_index = True) #Sort Alphabetically by Ticker
stock_info.head()
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
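As a cross-check, the same table can usually be pulled more compactly with pandas.read_html; the sketch below assumes the current Wikipedia column names ('Symbol', 'Security', 'GICS Sector'), which may change over time.
#Alternative sketch: let pandas parse the Wikipedia table directly (column names are assumptions about the current page layout)
sp500_table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
stock_info_alt = (sp500_table[['Symbol', 'Security', 'GICS Sector']]
                  .rename(columns={'Symbol': 'Ticker', 'Security': 'Name', 'GICS Sector': 'Sector'})
                  .sort_values('Ticker', ignore_index=True))
stock_info_alt.head()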
start_date = datetime.datetime(2019, 4, 1)
end_date = datetime.datetime(2019, 12, 1)
Earnings Before Interest, Taxes, Depreciation, and Amortization ('ebitda'): This indicator measures the overall financial performance, or profitability, of a company. It is often viewed as a cleaner measure of operating performance because it excludes the effects of financing and accounting deductions.
$EBITDA = \text{Net Income} + \text{Interest} + \text{Taxes} + \text{Depreciation} + \text{Amortization}$
Earnings per Share ('eps'): This indicator measures how much profit a company generates for each share of its stock, a common gauge of corporate value. The higher the Earnings per Share, the more profitable the company is on a per-share basis.
$\text{Earnings per Share} = \frac{\text{Net Income} - \text{Preferred Dividends}}{\text{End-of-Period Common Shares Outstanding}}$
Enterprise Value ('ev'): This indicator measures a company's total value: its market capitalization plus short-term and long-term debt, minus any cash on the balance sheet. It is often viewed as the theoretical "selling price" a company would go for.
$\text{Enterprise Value} = \text{Market Capitalization} + \text{Total Debt} - \text{Cash}$
Gross Profit Margin ('grossmargin'): This indicator measures the portion of each dollar of revenue that a company retains as gross profit.
$\text{Gross Profit Margin} = \frac{\text{Revenue} - \text{Cost of Goods Sold}}{\text{Revenue}}$
Price to Earnings Ratio ('pe'): This indicator shows the current share price relative to the company's Earnings per Share; investors use it to gauge the relative value of a company's shares (a short worked example follows these definitions).
$P/E = \frac{\text{Market Value per Share}}{\text{Earnings per Share}}$
Research and Development ('rnd'): This indicator measures a company's spending on research and development, i.e. its investment in developing new products and services.
Opening Price ('open'): The opening price for a stock is its price at the open of trading on the New York Stock Exchange (9:30 a.m. ET).
High Price ('high'): The high price for a stock is the peak intraday price of a stock.
Low Price ('low'): The low price for a stock is the lowest intraday price of a stock.
Adjusted Closing Price ('close'): The adjusted closing price is the closing price of a stock adjusted for stock splits, dividends, and rights offerings.
Volume ('volume'): The number of shares of a given company that change hands over the course of a trading day.
Dividends ('dividends'): The distribution of a portion of a company's earnings to its shareholders.
Unadjusted Closing Price ('closeunadj'): The unadjusted closing price is the price at the close of trading on the New York Stock Exchange (4:00 p.m. ET), without adjustment for splits, dividends, or rights offerings.
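To make the EPS and P/E definitions concrete, here is a quick worked example with made-up figures:
#Purely illustrative numbers
net_income = 50_000_000.0          #annual net income
preferred_dividends = 2_000_000.0
shares_outstanding = 12_000_000.0  #end-of-period common shares
share_price = 80.0
eps_example = (net_income - preferred_dividends) / shares_outstanding  #= 4.00 per share
pe_example = share_price / eps_example                                 #= 20.0x earnings
print(f'EPS: {eps_example:.2f}, P/E: {pe_example:.1f}')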
def extract_data(stocks, start_date, end_date):
"""Queries the PostgreSQL database for Equity and Financial data
for the given list of stocks between the start date and the end date
Args:
stocks: A list of tickers
start_date: A datetime object representing the start date
end_date: A datetime object representing the end date
Returns:
Two dataframes, the first one being equity data and the second
being the fundamental data
"""
fundamental = get_fundamental_filled(stocks, start_date, end_date,
['ebitda',
'eps',
'ev',
'grossmargin',
'pe',
'rnd',
'marketcap'])
equity_data = get_equity_data(stocks, start_date, end_date)
return equity_data, fundamental
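get_fundamental_filled, get_equity_data, and get_sp_close are database helpers defined outside this notebook. Roughly, each issues a SQL query over the psycopg2 connection and pivots the result so that rows are dates and columns are tickers; a minimal sketch of that pattern follows (the table and column names are hypothetical placeholders, not the actual schema):
def get_equity_data_sketch(stocks, start_date, end_date, conn):
    """Illustrative only: 'equity_prices', 'ticker', 'date', and 'close' are placeholder names."""
    query = """
        SELECT date, ticker, close
        FROM equity_prices
        WHERE ticker = ANY(%(tickers)s)
          AND date BETWEEN %(start)s AND %(end)s
    """
    raw = pd.read_sql(query, conn, params={'tickers': list(stocks), 'start': start_date, 'end': end_date})
    #Pivot to the shape used below: one row per date, one column per ticker
    return raw.pivot(index='date', columns='ticker', values='close')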
#Extract Equity Data and Fundamental Data for each ticker in the S&P500
equity_data, fundamental_data = extract_data(tickers, start_date, end_date)
#Create Dataframes for each variable of interest where columns correspond to the ticker and rows correspond to the date
#EQUITY
close_data = equity_data['close']
#FUNDAMENTAL
ebitda_data = fundamental_data['ebitda']
eps_data = fundamental_data['eps']
ev_data = fundamental_data['ev']
gm_data = fundamental_data['grossmargin']
pe_data = fundamental_data['pe']
rnd_data = fundamental_data['rnd']
mc_data = fundamental_data['marketcap']
ebitda_data.index.names = ['date']
eps_data.index.names = ['date']
ev_data.index.names = ['date']
gm_data.index.names = ['date']
pe_data.index.names = ['date']
rnd_data.index.names = ['date']
mc_data.index.names = ['date']
print(f' Number of stocks in close_data: {close_data.shape[1]}')
print(f' Number of stocks in ev_data: {ev_data.shape[1]}')
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
# For the list of tickers in the S&P500, drop tickers that aren't present in the fundamental data
for ind, x in stock_info['Ticker'].items():
if x not in ev_data.columns:
stock_info = stock_info[stock_info.Ticker != x]
#For each ticker in the close price dataframe, drop tickers that aren't present in the fundamental data
for x in close_data.columns:
if x not in ev_data.columns:
close_data.drop([x], axis=1, inplace=True)
stock_info.reset_index(inplace = True, drop = True)
print(f' Number of stocks in close_data: {close_data.shape[1]}')
print(f' Number of stocks in ev_data: {ev_data.shape[1]}')
print(f' Number of stocks in stock_info: {stock_info.shape[0]}')
print(f'NaN data in stock_info: {stock_info.isna().sum().sum()}')
print(f'NaN data in close_data: {close_data.isna().sum().sum()}')
print(f'NaN data in ebitda_data: {ebitda_data.isna().sum().sum()}')
print(f'NaN data in gm_data: {gm_data.isna().sum().sum()}')
print(f'NaN data in pe_data: {pe_data.isna().sum().sum()}')
print(f'NaN data in eps_data: {eps_data.isna().sum().sum()}')
print(f'NaN data in ev_data: {ev_data.isna().sum().sum()}')
print(f'NaN data in rnd_data: {rnd_data.isna().sum().sum()}')
print(f'NaN data in mc_data: {mc_data.isna().sum().sum()}')
#Fill missing values forward in time, then backward for any gaps at the start of the period
close_data = close_data.ffill().bfill()
ebitda_data = ebitda_data.ffill().bfill()
eps_data = eps_data.ffill().bfill()
ev_data = ev_data.ffill().bfill()
gm_data = gm_data.ffill().bfill()
pe_data = pe_data.ffill().bfill()
pe_data.dropna(axis=1, inplace=True) #Drop P/E columns that are still entirely NaN
rnd_data = rnd_data.ffill().bfill()
mc_data = mc_data.ffill().bfill()
print(f'NaN data in stock_info: {stock_info.isna().sum().sum()}')
print(f'NaN data in close_data: {close_data.isna().sum().sum()}')
print(f'NaN data in ebitda_data: {ebitda_data.isna().sum().sum()}')
print(f'NaN data in gm_data: {gm_data.isna().sum().sum()}')
print(f'NaN data in pe_data: {pe_data.isna().sum().sum()}')
print(f'NaN data in eps_data: {eps_data.isna().sum().sum()}')
print(f'NaN data in ev_data: {ev_data.isna().sum().sum()}')
print(f'NaN data in rnd_data: {rnd_data.isna().sum().sum()}')
print(f'NaN data in mc_data: {mc_data.isna().sum().sum()}')
close_data.head(10)
close_data = close_data.resample('D').ffill().reset_index()
close_data.set_index('date', inplace=True)
ebitda_data = ebitda_data.resample('D').ffill()
eps_data = eps_data.resample('D').ffill()
ev_data = ev_data.resample('D').ffill()
gm_data = gm_data.resample('D').ffill()
pe_data = pe_data.resample('D').ffill()
rnd_data = rnd_data.resample('D').ffill()
mc_data = mc_data.resample('D').ffill()
close_data.head(10)
# Get the S&P500 closing price
sp_data = get_sp_close(start_date, end_date)
sp_data = sp_data.resample('D').ffill().reset_index()
sp_data.set_index('date', inplace=True)
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(sp_data.index, sp_data, label = 'Daily SPY')
ax.plot(sp_data.rolling(10).mean(), label = '10 Day Rolling Mean')
ax.plot(sp_data.rolling(30).mean(), label = '30 Day Rolling Mean')
ax.plot(sp_data.rolling(100).mean(), label = '100 Day Rolling Mean')
ax.set_title('S&P500 Closing Price')
ax.set_xlabel('Date')
ax.set_ylabel('Closing price (in USD)')
ax.legend()
fig.show()
print(f"S&P500 return between April 1, 2019 and December 1, 2019: {(sp_data['close'].iloc[-1] - sp_data['close'].iloc[0]) / sp_data['close'].iloc[0] * 100:.2f}%")
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(sp_data.index, stats.zscore(sp_data), label = 'SPY')
ax.set_title("Z-scored Clsoing Price for S&P500")
ax.set_ylabel("Standard Deviations away from mean closing price")
ax.set_xlabel("Date")
plt.show()
sp_data.pct_change().plot.hist(bins = 60)
plt.xlabel("Daily returns %")
plt.ylabel("Frequency")
plt.title("S&P500 daily returns data")
plt.text(-0.03,100,"Extremely Low\nreturns")
plt.text(0.02,100,"Extremely High\nreturns")
plt.show()
top_20_ebitda = ebitda_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_ebitda.index,top_20_ebitda.to_numpy())
ax.set_title('Companies with highest EBITDA')
fig.show()
top_20_eps = eps_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_eps.index,top_20_eps.to_numpy())
ax.set_title('Companies with highest Earnings per Share')
fig.show()
top_20_pe = pe_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_pe.index,top_20_pe.to_numpy())
ax.set_title('Companies with highest Price to Earnings Ratio')
fig.show()
top_20_rnd = rnd_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_rnd.index,top_20_rnd.to_numpy())
ax.set_title('Companies with highest Research and Development Costs')
fig.show()
top_20_ev = ev_data.mean().sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(top_20_ev.index,top_20_ev.to_numpy())
ax.set_title('Companies with highest Enterprise Value')
fig.show()
frequencies = stock_info.groupby('Sector').size()
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(frequencies, labels=frequencies.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
#Calculate the total market cap of the top 10 companies in the S&P500
top_10_mc = mc_data.iloc[-1].sort_values(ascending=False)[0:10].sum()
#Calculate the total market cap of the S&P500
all_mc = mc_data.iloc[-1].sum()
print("\nPercent of Total S&P500 Market Capitalization Represented by top 10 stocks:")
print(top_10_mc/all_mc * 100)
top_20_mc = mc_data.iloc[-1].sort_values(ascending=False)[0:20]
fig, ax = plt.subplots(figsize=(15,8))
ax.bar(top_20_mc.index,top_20_mc.to_numpy())
ax.set_title("S&P500 Stocks with Largest Market Capitalization")
ax.set_ylabel('Market Capitalization')
ax.set_xlabel('Tickers')
fig.show()
average_mc = mc_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
mc_sectors = frequencies.copy()
for i, row in mc_sectors.items():
mc_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_mc.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in mc_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
mc_sectors[k] += row2
for i, val in mc_sectors.items():
mc_sectors[i] /= frequencies[i]
mc_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(mc_sectors, labels=mc_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
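The nested loops above (and the analogous enterprise value, EBITDA, P/E, EPS, and R&D blocks that follow) boil down to averaging a per-ticker statistic within each sector. A vectorized sketch of the same idea, assuming every ticker in stock_info has a column in mc_data, maps tickers to sectors and lets groupby do the averaging:
#Sketch: per-sector average via a join and groupby instead of nested loops
ticker_to_sector = stock_info.set_index('Ticker')['Sector']
mc_sectors_alt = (average_mc.rename('avg_mc').to_frame()
                  .join(ticker_to_sector, how='inner')
                  .groupby('Sector')['avg_mc']
                  .mean())
mc_sectors_alt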
average_ev = ev_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
ev_sectors = frequencies.copy()
for i, row in ev_sectors.items():
ev_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_ev.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in ev_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
ev_sectors[k] += row2
for i, val in ev_sectors.items():
ev_sectors[i] /= frequencies[i]
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(ev_sectors, labels=ev_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_ebitda = ebitda_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
ebitda_sectors = frequencies.copy()
for i, row in ebitda_sectors.items():
ebitda_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_ebitda.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in ebitda_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
ebitda_sectors[k] += row2
for i, val in ebitda_sectors.items():
ebitda_sectors[i] /= frequencies[i]
ebitda_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(ebitda_sectors, labels=ebitda_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_pe = pe_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
pe_sectors = frequencies.copy()
for i, row in pe_sectors.items():
pe_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_pe.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in pe_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
pe_sectors[k] += row2
for i, val in pe_sectors.items():
pe_sectors[i] /= frequencies[i]
pe_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(pe_sectors, labels=pe_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_eps = eps_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
eps_sectors = frequencies.copy().astype('float')
for i, row in eps_sectors.items():
eps_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_eps.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in eps_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
eps_sectors[k] += row2
for i, val in eps_sectors.items():
eps_sectors[i] = float(eps_sectors[i]/frequencies[i])
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(eps_sectors, labels=eps_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
average_rnd = rnd_data.mean().sort_values(ascending=False)
frequencies = stock_info.groupby('Sector').size()
rnd_sectors = frequencies.copy()
for i, row in rnd_sectors.items():
rnd_sectors[i] = 0
for i, row in stock_info.iterrows():
for j, row2 in average_rnd.items():
if(stock_info.iloc[i]['Ticker'] == j):
for k, row3 in rnd_sectors.items():
if (k == stock_info.iloc[i]['Sector']):
rnd_sectors[k] += row2
for i, val in rnd_sectors.items():
rnd_sectors[i] /= frequencies[i]
rnd_sectors
fig1, ax1 = plt.subplots(figsize=(10,10))
ax1.pie(rnd_sectors, labels=rnd_sectors.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
frame = { 'close': close_data['AAPL'][:-2],
'ebitda': ebitda_data['AAPL'],
'eps': eps_data['AAPL'],
'ev': ev_data['AAPL'],
'gm': gm_data['AAPL'],
'pe': pe_data['AAPL'],
'rnd': rnd_data['AAPL'],
'mc': mc_data['AAPL']
}
AAPL = pd.DataFrame(frame)
AAPL.head()
#Fit a linear regression with OLS in statsmodels (unlike scikit-learn, it reports summary statistics such as p-values)
est = sm.OLS(AAPL['close'], AAPL[['ebitda', 'eps', 'ev', 'gm', 'pe', 'rnd', 'mc']])
est2 = est.fit()
print(est2.summary()) #Print summary statistics
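As written, sm.OLS fits the regression without an intercept (statsmodels does not add one automatically). If an intercept is wanted, the usual pattern is to add a constant column to the design matrix:
#Optional variant with an intercept term
X_const = sm.add_constant(AAPL[['ebitda', 'eps', 'ev', 'gm', 'pe', 'rnd', 'mc']])
est_const = sm.OLS(AAPL['close'], X_const).fit()
print(est_const.summary())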
def predict_auto_arima(data, target, columns, split, days_ahead, verbose = True):
sbn.set(font_scale=1.2)
predicted_returns = {}
datasets = []
count = 1
for col in columns:
print(f'Processing ticker {count}')
print(col)
print('\n')
X = data[target][col]
X = X.dropna()
differenced = X.diff()
datasets.append(differenced)
if verbose:
#Plot the Time Series for close data
plt.figure(figsize=(10,10))
plt.plot(X)
plt.title(col)
plt.ylabel(str(target))
plt.xlabel('Date')
plt.show()
#Check whether the data is stationary with the Augmented Dickey-Fuller test
print("Is the Data Stationary? (Augmented Dickey-Fuller Test)")
adf_test = ADFTest(alpha = 0.05)
adf_res = adf_test.should_diff(X)
print('p-value: '+str(adf_res[0]))
print('The Time Series is Stationary? '+str(not adf_res[1]))
print("\n")
train, test = X[0:int(len(X)*split)], X[int(len(X)*split):] #Create train/test split
test_set_range = test.index
train_set_range = train.index
if verbose:
#Plot the time series and denote the training period and the testing period
plt.figure(figsize=(10,10))
plt.plot(train_set_range, train, label = 'train')
plt.plot(test_set_range, test, label = 'test')
plt.title(col)
plt.xlabel('Date')
plt.ylabel(str(target))
plt.legend()
plt.show()
#Plot the ACF and PACF graphs
fig_acf, ax_acf = plt.subplots(figsize=(10,10))
sm.graphics.tsa.plot_acf(X.values.squeeze(), lags=40, ax=ax_acf)
fig_pacf, ax_pacf = plt.subplots(figsize=(10,10))
sm.graphics.tsa.plot_pacf(X.values.squeeze(), lags=40, ax=ax_pacf)
plt.show()
#Automatically fit an ARIMA model to the time series
trace = verbose #Print the stepwise search trace only in verbose mode
auto = pm.auto_arima(train, seasonal=False, stepwise=True,
suppress_warnings=True, error_action="ignore", max_p=10, max_q = 10, max_d = 10,
max_order=None, random_state = 9, n_fits = 50, trace=trace)
if verbose:
print(auto.summary())
#Create a dataframe of predicted close price for the period
df = pd.DataFrame()
df["date"] = pd.date_range(test.index[0], periods=days_ahead, freq="D")
df = df.set_index(df["date"])
prediction = pd.DataFrame(auto.predict(n_periods = days_ahead), index = df.index)
prediction.columns = ['predicted_close']
if verbose:
#plot the n-day prediction against the testing period
plt.figure(figsize=(10,10))
plt.plot(train, label = "train")
plt.plot(test, label="test")
plt.plot(prediction, label = "predicted")
plt.title(col)
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.show()
pred_price = prediction.iloc[-1][0]
print(prediction)
bought_price = train.iloc[-1]
print(bought_price)
print("Predicted Return:")
pred_ret = (pred_price-bought_price)/bought_price
predicted_returns[col] = prediction
print(pred_ret)
print("\n")
print("Actual Return:")
try:
print((test.iloc[days_ahead] - bought_price)/bought_price)
except IndexError:
print("Date for Prediction not in Testing Period")
print("\n")
#Calculate a Rolling Daily Prediction using ARIMA and calculate Mean Squared Error
train_data, test_data = X[0:int(len(X)*split)], X[int(len(X)*split):]
training_data = train_data.values
test_data = test_data.values
history = [x for x in training_data]
model_predictions = []
N_test_observations = len(test_data)
for time_point in range(N_test_observations):
model = ARIMA(history, order=(4,2,0))
model_fit = model.fit()
output = model_fit.forecast()
yhat = output[0]
model_predictions.append(yhat)
true_test_value = test_data[time_point]
history.append(true_test_value)
MSE_error = mean_squared_error(test_data, model_predictions)
if verbose:
print('Testing Mean Squared Error is {}'.format(MSE_error))
test_set_range = X[int(len(X)*split):].index
if verbose:
#Plot the rolling predictions during the testing period
plt.figure(figsize=(10,10))
plt.plot(test_set_range, model_predictions, color='blue', linestyle='dashed',label='Predicted Price')
plt.plot(test_set_range, test_data, color='red', label='Actual Price')
plt.title(str(col) + ' Closing Price Prediction')
plt.xlabel('Date')
plt.ylabel('Prices')
plt.legend()
plt.show()
count += 1
return predicted_returns, datasets
a, b = predict_auto_arima(equity_data, 'close', ['WMT', 'JPM','BRK.B'], 0.7, 20, verbose=True)
#Create labels for each stock
labs = pd.DataFrame({'Names' : close_data.columns})
#Pairwise DTW is costly, so cluster stocks using only the last few observations of the period (the final four daily closes here)
latter = close_data.iloc[-4:]
#Compute pairwise Dynamic Time Warping Distance between each pair of stocks in the S&P500, store it as a "correlation matrix"
distance_matrix = []
count = 0
for x in latter.columns:
count += 1
distance_matrix.append([])
for y in latter.columns:
distance, path = fastdtw(stats.zscore(latter[x]), stats.zscore(latter[y]), dist=euclidean)
distance_matrix[count-1].append(distance)
#Convert the Matrix into a Dataframe:
matrix = pd.DataFrame(distance_matrix, index = close_data.columns, columns = close_data.columns)
N = matrix.shape[0]
df = matrix
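Since DTW distance is symmetric and zero on the diagonal, only the upper triangle of the matrix actually needs to be computed; a sketch of building the same matrix with roughly half the fastdtw calls:
#Sketch: exploit symmetry to halve the number of pairwise DTW computations
cols = list(latter.columns)
zscored = {c: stats.zscore(latter[c]) for c in cols}
dists = np.zeros((len(cols), len(cols)))
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        d, _ = fastdtw(zscored[cols[i]], zscored[cols[j]], dist=euclidean)
        dists[i, j] = dists[j, i] = d
matrix_alt = pd.DataFrame(dists, index=cols, columns=cols)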
def plot_corr(df):
corr = df.corr()
# Plot the correlation matrix
fig, ax = plt.subplots(figsize=(20,20))
sbn.heatmap(corr.to_numpy(), xticklabels = False, yticklabels = False, ax=ax, cmap="RdYlGn")
plt.title('Dynamic Time-Warping Distance')
fig.show()
plot_corr(df)
#Perform a hierarchical clustering of the matrix and replot the heatmap for the clusters found
X = df.corr().values
d = sch.distance.pdist(X)
L = sch.linkage(X, method='complete')
ind = sch.fcluster(L, 0.5*d.max(), 'distance')
columns = [df.columns.tolist()[i] for i in list((np.argsort(ind)))]
df = df.reindex(columns, axis=1)
plot_corr(df)
labels = df.index.to_numpy()
p = len(labels)
plt.figure(figsize=(30,30))
plt.title('Hierarchical Clustering Dendrogram (truncated)', fontsize=20)
plt.xlabel('Ticker', fontsize=16)
plt.ylabel('distance', fontsize=16)
# call dendrogram to get the returned dictionary
# (plotting parameters can be ignored at this point)
R = dendrogram(
L,
truncate_mode='lastp', # show only the last p merged clusters
p=p, # show only the last p merged clusters
no_plot=False,
)
# create a label dictionary
temp = {R["leaves"][ii]: labels[ii] for ii in range(len(R["leaves"]))}
def llf(xx):
return "{}".format(temp[xx])
dendrogram(
L,
truncate_mode='lastp', # show only the last p merged clusters
p=p, # show only the last p merged clusters
leaf_label_func=llf,
leaf_rotation=60.,
leaf_font_size=7.,
show_contracted=True, # to get a distribution impression in truncated branches
)
plt.show()
def give_cluster_assigns(df, numclust, transpose=True):
clusters = {}
if transpose==True:
data_dist = pdist(df.transpose())
data_link = linkage(data_dist, metric='correlation', method='complete')
cluster_assigns=pd.Series(sch.fcluster(data_link, numclust, criterion='maxclust'), index=df.columns)
else:
data_dist = pdist(df)
data_link = linkage(data_dist, metric='correlation', method='complete')
cluster_assigns=pd.Series(sch.fcluster(data_link, numclust, criterion='maxclust'), index=df.index)
for i in range(1,numclust+1):
print("Cluster ",str(i),": ( N =",len(cluster_assigns[cluster_assigns==i].index),")", ", ".join(list(cluster_assigns[cluster_assigns==i].index)))
clusters[i] = list(cluster_assigns[cluster_assigns==i].index)
return cluster_assigns, clusters
gg, hh = give_cluster_assigns(df, 6, True)
for j in range(1,7):
print(f'Cluster #{j}')
cluster1 = []
cluster1_industry = []
for i, x in gg.items():
if x == j:
cluster1.append(i)
for x in cluster1:
for a, b in stock_info.iterrows():
if b['Ticker'] == x:
cluster1_industry.append(stock_info.iloc[a]['Sector'])
cluster1_industry = pd.Series(cluster1_industry)
frequencies = cluster1_industry.groupby(cluster1_industry).size()
fig1, ax1 = plt.subplots(figsize=(15, 15))
ax1.pie(frequencies, labels=frequencies.index, autopct='%1.1f%%', startangle=90, )
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
print('Average distance between stocks in the S&P500: ' + str(np.mean(distance_matrix)))
print('Standard Deviation of distance between stocks in the S&P500: ' + str(np.std(distance_matrix)))
#Build an adjacency matrix: connect two stocks if their DTW distance is more than one standard deviation below the mean
distance_array = np.array(distance_matrix)
graph = np.where(distance_array <= distance_array.mean() - distance_array.std(), 1, 0)
G = nx.from_numpy_array(graph)
nx.draw(G, with_labels=True, labels=dict(labs['Names']))
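An equivalent way to keep tickers attached to the graph is to relabel the integer nodes once up front (a sketch); the centrality tables below would then come out indexed by ticker without the later set_index step.
#Sketch: relabel nodes 0..N-1 with their tickers
G_named = nx.relabel_nodes(G, dict(enumerate(labs['Names'])))
nx.draw(G_named, with_labels=True, node_size=50, font_size=6)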
import itertools
k = 6
comp = nx.algorithms.community.centrality.girvan_newman(G)
for communities in itertools.islice(comp, k):
print(tuple(sorted(c) for c in communities))
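The loop above prints the first k partitions of the Girvan-Newman hierarchy. To stop once the hierarchy reaches k communities, a common itertools pattern is:
#Sketch: keep splitting only until k communities have been produced
comp_k = nx.algorithms.community.centrality.girvan_newman(G)
limited = itertools.takewhile(lambda partition: len(partition) <= k, comp_k)
final_partition = None
for communities in limited:
    final_partition = communities
if final_partition is not None:
    print([sorted(c) for c in final_partition])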
graph_vals = pd.DataFrame({'Eigenvector Centrality' : list(nx.algorithms.centrality.eigenvector_centrality(G).values()),
'Closeness Centrality': list(nx.algorithms.centrality.closeness_centrality(G).values()),
'Betweenness Centrality': list(nx.algorithms.centrality.betweenness_centrality(G).values()),
'Degree Centrality': list(nx.algorithms.centrality.degree_centrality(G).values())}, index = list(nx.algorithms.centrality.degree_centrality(G).keys()))
graph_vals.head()
graph_vals.set_index(labs['Names'], inplace=True)
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Closeness Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Closeness Centrality')
plt.ylabel('Closeness Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(20,8))
top_centralities = graph_vals['Betweenness Centrality'].sort_values(ascending=False)[0:15]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Betweenness Centrality')
plt.ylabel('Betweenness Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Degree Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Degree Centrality')
plt.ylabel('Degree Centrality')
plt.xlabel('Ticker')
fig.show()
fig, ax = plt.subplots(figsize=(25,8))
top_centralities = graph_vals['Eigenvector Centrality'].sort_values(ascending=False)[0:40]
plt.bar(top_centralities.index, top_centralities)
plt.title('Stocks in S&P500 with highest Eigenvector Centrality')
plt.ylabel('Eigenvector Centrality')
plt.xlabel('Ticker')
fig.show()
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
tickers2 = tickers.copy() #Copy the list so the Finviz-specific symbols below aren't appended to the original tickers list
tickers2.append('BRK-B') #Berkshire Hathaway and Brown-Forman use a different ticker format on Finviz
tickers2.append('BF-B')
for ticker in tickers2:
try:
url = finwiz_url + ticker #Construct a url to query
req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) #Send a GET request
response = urlopen(req)
# Parse the page HTML with BeautifulSoup
html = BeautifulSoup(response, 'lxml')
# Find 'news-table' in the Soup and load it into 'news_table'
news_table = html.find(id='news-table')
# Add the table to our dictionary
news_tables[ticker] = news_table
except Exception:
continue #Skip tickers whose Finviz page can't be fetched
# Read one single day of headlines for 'AMZN'
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.findAll('tr')
for i, table_row in enumerate(amzn_tr):
# Read the text of the element 'a' into 'link_text'
a_text = table_row.a.text
# Read the text of the element 'td' into 'data_text'
td_text = table_row.td.text
# Print the contents of 'link_text' and 'data_text'
print(a_text)
print(td_text)
# Exit after printing 4 rows of data
if i == 3:
break
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
# Iterate through all tr tags in 'news_table'
for x in news_table.findAll('tr'):
# read the text from each tr tag into text
# get text from a only
text = x.a.get_text()
# split the text in the td tag into a list
date_scrape = x.td.text.split()
# if the length of 'date_scrape' is 1, load 'time' as the only element
if len(date_scrape) == 1:
time = date_scrape[0]
# else load 'date' as the 1st element and 'time' as the second
else:
date = date_scrape[0]
time = date_scrape[1]
# The dictionary key is the ticker symbol itself (the split on '_' leaves it unchanged)
ticker = file_name.split('_')[0]
# Append ticker, date, time and headline as a list to the 'parsed_news' list
parsed_news.append([ticker, date, time, text])
import nltk
nltk.download('vader_lexicon')
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)
# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news.head()
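For reference, vader.polarity_scores returns a dict with 'neg', 'neu', 'pos', and 'compound' entries for a single string; the compound score, which lies in [-1, 1], is what gets averaged per ticker below.
#Example of raw VADER output for a single (made-up) headline
print(vader.polarity_scores("Company beats earnings expectations and raises full-year guidance"))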
stocks = []
sentiments = []
dd = parsed_and_scored_news.groupby('ticker')
for key, value in dd:
if key in gg.index:
stocks.append(key)
sentiments.append(value['compound'].mean())
sentiment_info = pd.DataFrame({'Ticker': stocks, 'Sentiment': sentiments}) #Create Dataframe
sentiment_info = sentiment_info.set_index('Ticker')
sentiment_info.head()
print('Average Sentiment for S&P500: '+str(sentiment_info['Sentiment'].mean()))
neg_sent = sentiment_info['Sentiment'].sort_values(ascending=True)[0:10]
plt.figure(figsize=(10,5))
plt.bar(neg_sent.index, neg_sent)
plt.title('Stocks in the S&P500 with lowest sentiment')
plt.show()
pos_sent = sentiment_info['Sentiment'].sort_values(ascending=False)[0:10]
plt.figure(figsize=(10,5))
plt.bar(pos_sent.index, pos_sent)
plt.title('Stocks in the S&P500 with highest sentiment')
plt.show()
average_sentiment = {}
for i, x in stock_info.iterrows():
for j, y in sentiment_info.iterrows():
if j == x['Ticker']:
try:
average_sentiment[x['Sector']] += y['Sentiment']
except KeyError: #First scored ticker seen for this sector
average_sentiment[x['Sector']] = y['Sentiment']
for i, x in stock_info.groupby(['Sector']).count()['Name'].items():
average_sentiment[i] /= x
average_sentiment
plt.figure(figsize=(20, 10))
plt.title('Sentiment by Sector')
plt.bar(average_sentiment.keys(), average_sentiment.values())
plt.xticks(rotation=65)
plt.show()
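The same per-sector average sentiment can be computed without the nested loops by joining the sentiment table to the ticker-to-sector mapping (a sketch; this averages over the tickers that actually have sentiment data):
#Sketch: sector-level sentiment via a join and groupby
sector_sentiment = (sentiment_info
                    .join(stock_info.set_index('Ticker')['Sector'], how='inner')
                    .groupby('Sector')['Sentiment']
                    .mean())
sector_sentiment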
average_sentiment = {}
for i, x in gg.items():
for j, y in sentiment_info.iterrows():
if j == i:
try:
average_sentiment[x] += y['Sentiment']
except KeyError: #First scored ticker seen for this cluster
average_sentiment[x] = y['Sentiment']
for x in average_sentiment.keys():
average_sentiment[x] /= len(hh[x])
average_sentiment
plt.figure(figsize=(20, 10))
plt.title('Sentiment by Cluster')
plt.bar(average_sentiment.keys(), average_sentiment.values())
plt.show()