Commit 10cf5952 authored by Antoine Rollet

Upload New File

parent f9b5b621
import requests
import pandas as pd
import numpy as np
import time
link="http://www.football-data.co.uk/mmz4281/"
#Lien du site où se trouve les csv
i=11
df=pd.read_csv(link+str(i)+str(i+1)+"/F1.csv")
#On crée un premier DataFrame à partir du premier fichier csv
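#e.g. i=10 resolves to http://www.football-data.co.uk/mmz4281/1011/F1.csv,
#the F1 (French Ligue 1) results file for the 2010/11 season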
for i in range(11,18):
    df2=pd.read_csv(link+str(i)+str(i+1)+"/F1.csv")
    #Read the remaining season files one by one
    df=pd.concat([df,df2],sort=False)
    #Append each csv to the previous ones
    #sort=False keeps the columns from being reordered alphabetically
#The other leagues available on football-data.co.uk (E1.csv, SP1.csv, D1.csv,
#I1.csv, SC1.csv) can be concatenated the same way, e.g.:
# for i in range(10,18):
#     df2=pd.read_csv(link+str(i)+str(i+1)+"/E1.csv")
#     df=pd.concat([df,df2],sort=False)
#df now contains all the data from the football-data.co.uk csv files (thanks bro)
##
def flip_date_not_vectorized(date):
    #Turn a "dd/mm/yy" string into "yy/mm/dd"; anything else is returned unchanged
    if not( type(date).__name__ == "str" and len(date)==8 and date[2]=="/" and date[5]=="/" and date[:2].isdigit() and date[3:5].isdigit() and date[6:].isdigit() ):
        return date
    return date[6:]+"/"+date[3:5]+"/"+date[:2]
flip_date=np.vectorize(flip_date_not_vectorized)
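#Example: flip_date_not_vectorized("17/08/14") returns "14/08/17", so comparing
#the resulting "yy/mm/dd" strings lexicographically matches chronological order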
## Preparation
#Replace missing values with the column mean
df.fillna(df.mean(),inplace=True)
#Rewrite the dates as "yy/mm/dd" so they can be sorted later
df["Date"].mask(df["Date"]==df["Date"],other=flip_date(df["Date"]),errors="ignore",inplace=True)
## Building the input and output data
def get_5_last_matches(team,date):
    #Return the team's 5 most recent matches played strictly before the given date
    return df.loc[ ((df["HomeTeam"] == team) | (df["AwayTeam"] == team)) & (df["Date"] < date) ].sort_values("Date",ascending=False)[:5]
get_5_last_matches("Caen","17/03/21")
#Example lookup: dates are now "yy/mm/dd", so "17/03/21" is 21 March 2017
def transform_match_into_input(match,team):
    X_input=[]
    #Flag whether the team played this match at home or away
    if match.HomeTeam==team:
        X_input+=[1,0]
    else:
        X_input+=[0,1]
    if X_input[0]==1: #Home match: keep the statistics in their original order
        X_input+=list(match[['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC',
            'AC', 'HY', 'AY', 'HR', 'AR','B365H','B365D','B365A']].values)
    else:
        #Away match: swap the home and away statistics
        X_input+=list(match[['FTAG', 'FTHG', 'HTAG','HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF',
            'AC', 'HC', 'AY', 'HY', 'AR', 'HR','B365A','B365D','B365H']].values)
    #Sanity check: every feature should be numeric
    for x in X_input:
        if type(x).__name__ != "float64" and type(x).__name__ != "int":
            print(x,type(x).__name__,X_input)
            print(match)
            return None
    return X_input
def make_input_data(match):
    team1 = match.HomeTeam
    team2 = match.AwayTeam
    #Fetch the 5 previous matches of each team
    five_last_dom = get_5_last_matches(team1,match.Date)
    five_last_ext = get_5_last_matches(team2,match.Date)
    if len(five_last_dom)==5 and len(five_last_ext)==5: #Make sure we have enough history
        X_input_data = [match.B365H,match.B365D,match.B365A]
        for i in range(5):
            match=five_last_dom.iloc[i,:]
            X_input_data+=transform_match_into_input(match,team1)
        for i in range(5):
            match=five_last_ext.iloc[i,:]
            X_input_data+=transform_match_into_input(match,team2)
        if len(X_input_data)==213:
            return X_input_data
        else:
            print("input error, wrong length: len(X) = ",len(X_input_data))
    else:
        print("input error, not enough previous matches", match.Date,match.HomeTeam,match.AwayTeam)
def make_output_data(match):
    #One-hot encode the full-time result: home win / draw / away win
    if match.FTR =="H":
        return [1,0,0]
    elif match.FTR == "D":
        return [0,1,0]
    elif match.FTR == "A":
        return [0,0,1]
    else:
        print("output error, unreadable result", match.FTR)
def make_odds_data(match):
    #Bet365 odds for home win, draw and away win
    return [match.B365H,match.B365D,match.B365A]
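#Each input vector has 3 + 2*5*21 = 213 features: the Bet365 odds of the match to
#predict, followed by 21 features (home/away flag + 19 statistics) for each of the
#5 previous matches of the home team and of the away team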
X_data = []
y_data = []
odds_data = []
t0=time.time()
for i in range(df.shape[0]):
    match = df.iloc[i,:]
    X=make_input_data(match)
    y=make_output_data(match)
    odds=make_odds_data(match)
    if i%100==0:
        #Progress report every 100 matches
        print(i,df.shape[0])
        print(time.time()-t0)
    if X is not None and y is not None and odds is not None:
        X_data.append(X)
        y_data.append(y)
        odds_data.append(odds)
print("Elapsed time ",time.time()-t0)
##
#Careful: the raw data are not normalized yet
from sklearn.model_selection import train_test_split
X_data=np.array(X_data)
y_data=np.array(y_data)
odds_data=np.array(odds_data)
from sklearn.preprocessing import StandardScaler
#Standardize the features (zero mean, unit variance)
sscaler= StandardScaler()
X_data = sscaler.fit_transform(X_data)
X_train,X_test,y_train,y_test, odds_train,odds_test=train_test_split(X_data,y_data,odds_data)
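#A possible variant (not in the original script): leave X_data unscaled, split first,
#then fit the scaler on the training split only and reuse it on the test split:
# X_train,X_test,y_train,y_test,odds_train,odds_test=train_test_split(X_data,y_data,odds_data)
# X_train=sscaler.fit_transform(X_train)
# X_test=sscaler.transform(X_test)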
## Logistic regression model
from sklearn.linear_model import LogisticRegression
#scikit-learn expects class labels rather than one-hot vectors
y_train_labels=y_train.argmax(axis=1)
y_test_labels=y_test.argmax(axis=1)
result_model = LogisticRegression(max_iter=500)
result_model.fit(X_train,y_train_labels)
print(result_model.score(X_test,y_test_labels))
## Metrics
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score, recall_score,f1_score
print(confusion_matrix(y_test_labels,result_model.predict(X_test)))
## Improving the evaluation / cross-validation
from sklearn.model_selection import cross_val_score
print(cross_val_score(result_model,X_train,y_train_labels,cv=5))
## Evaluating the profit
balance=0
for i in range(len(X_test)):
    pred=result_model.predict(np.array([X_test[i]]))[0]
    if pred==y_test_labels[i]:
        #Correct prediction: collect the odds on a 1-unit stake, minus the stake
        balance+=odds_test[i][pred] - 1
    else:
        #Wrong prediction: lose the 1-unit stake
        balance-=1
print(balance)
## Neural network model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD
neuronal_model = Sequential()
#adding layers
n_nodes=250
neuronal_model.add(Dense(n_nodes,activation='linear',input_shape=(X_data.shape[1]-3,)))
#13 more hidden layers of the same size
for _ in range(13):
    neuronal_model.add(Dense(n_nodes,activation='linear'))
#output layer
neuronal_model.add(Dense(3,activation="softmax"))
#compiling
opt = SGD(lr=0.0001)
neuronal_model.compile(optimizer=opt,loss="categorical_crossentropy",metrics=['accuracy'])
#fitting
early_stopping_monitor = EarlyStopping(patience=5)
#The first 3 columns (the bookmaker odds of the match to predict) are dropped from the input
neuronal_model.fit(X_train[:,3:],y_train,shuffle=True,verbose=2,epochs=600,validation_split=0.2,callbacks=[early_stopping_monitor])
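#Quick check (a small addition, not in the original): overall accuracy of the
#network on the held-out matches, using the same 3-column offset as for training
test_loss,test_acc=neuronal_model.evaluate(X_test[:,3:],y_test,verbose=0)
print("test accuracy:",round(test_acc,3))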
## Evaluating the profit
import matplotlib.pyplot as plt
H=neuronal_model.predict(X_test[:,3:])
balance=1
reussite=0
nombre=0
for i in range(len(X_test)):
    pred=H[i].argmax()
    if odds_test[i][pred] > 1/H[i][pred] and H[i][pred]>0.5: #Bet only if the odds are attractive, i.e. the expected value is positive
        # print(odds_test[i],np.round(H[i],3))
        # print(np.round(odds_test[i]*H[i]-1,3),y_test[i])
        # print("\n")
        #Stake a fraction of the balance, weighted by the predicted probability
        mise=0.1*balance*H[i][pred]
        # mise=1
        balance-=mise
        # print("balance: ",balance,", stake= ",mise)
        # print("\n")
        nombre+=1
        if y_test[i][pred]==1:
            #Winning bet: collect the stake times the odds
            reussite+=1
            balance+= odds_test[i][pred]*mise
        if i%20==0:
            plt.scatter(i,balance)
            print(odds_test[i],np.round(H[i],3))
            print(np.round(odds_test[i]*H[i]-1,3),y_test[i])
            print("\n")
            print("balance: ",balance,", stake= ",mise)
            print("\n")
plt.show()
print(reussite,nombre)
print(balance)
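#A minimal sketch (not in the original script): score one historical match with the
#trained network by rebuilding its 213-feature vector, rescaling it with the fitted
#scaler and dropping the 3 bookmaker-odds columns before predicting
sample=df.iloc[-1,:]   #any match whose two teams have at least 5 previous games
x=make_input_data(sample)
if x is not None:
    x=sscaler.transform(np.array([x]))
    probs=neuronal_model.predict(x[:,3:])[0]
    print("P(home win, draw, away win) =",np.round(probs,3))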