Antoine Rollet / myProjects / Commits / 10cf5952

Commit 10cf5952, authored Nov 27, 2019 by Antoine Rollet (parent f9b5b621).
Showing 1 changed file with 294 additions and 0 deletions.

football_data.py  (new file, 0 → 100644)  +294 −0
import requests
import pandas as pd
import numpy as np
import time

link = "http://www.football-data.co.uk/mmz4281/"
# Base URL of the site hosting the csv files
i = 10
df = pd.read_csv(link + str(i) + str(i + 1) + "/F1.csv")
# Build a first DataFrame from the first csv file (season 2010/11)

for i in range(11, 18):
    df2 = pd.read_csv(link + str(i) + str(i + 1) + "/F1.csv")
    # Read the files one by one
    df = pd.concat([df, df2], sort=False)
    # Append each csv file after the previous ones
    # sort=False keeps the columns from being reordered alphabetically
# The same download loop can be repeated for the other leagues
# (E1, SP1, D1, I1, SC1):
#
# for league in ["E1", "SP1", "D1", "I1", "SC1"]:
#     for i in range(10, 18):
#         df2 = pd.read_csv(link + str(i) + str(i + 1) + "/" + league + ".csv")
#         # Read the files one by one
#         df = pd.concat([df, df2], sort=False)
#
# df then holds all the data from the football-data.co.uk csv files (thanks bro)
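
# Note: `requests` is imported above but never used by the loop. A minimal
# sketch of how it could help, assuming one wants to skip seasons whose csv
# is missing (an illustration, not part of the original pipeline):
#
# for i in range(10, 18):
#     url = link + str(i) + str(i + 1) + "/F1.csv"
#     if requests.head(url).status_code == 200:
#         df = pd.concat([df, pd.read_csv(url)], sort=False)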
##
def flip_date_not_vectorized(date):
    # Turn a "dd/mm/yy" string into "yy/mm/dd" so that string comparison
    # sorts dates chronologically; anything that does not match the
    # expected format is returned unchanged.
    if not (isinstance(date, str)
            and len(date) == 8
            and date[2] == "/"
            and date[5] == "/"
            and date[:2].isdigit()
            and date[3:5].isdigit()
            and date[6:].isdigit()):
        return date
    return date[6:] + "/" + date[3:5] + "/" + date[:2]

flip_date = np.vectorize(flip_date_not_vectorized)
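
# Quick sanity check of the flip (illustrative values only):
assert flip_date_not_vectorized("27/11/19") == "19/11/27"
assert flip_date_not_vectorized("27-11-19") == "27-11-19"  # malformed input passes through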
## Preparation
# Replace missing values with the column mean
# (numeric_only=True restricts the mean to numeric columns, which recent
# pandas versions no longer do implicitly)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Change the date format so we can sort later; flip_date already leaves
# NaN and malformed values untouched, so no masking is needed
df["Date"] = flip_date(df["Date"])
## Building the input and output data

def get_5_last_matches(team, date):
    # The five most recent matches (home or away) played by `team`
    # strictly before `date`
    return df.loc[((df["HomeTeam"] == team) | (df["AwayTeam"] == team))
                  & (df["Date"] < date)].sort_values("Date", ascending=False)[:5]

get_5_last_matches("Caen", "17/03/21")
def transform_match_into_input(match, team):
    X_input = []
    # Flag whether `team` played this match at home or away
    if match.HomeTeam == team:
        X_input += [1, 0]
    else:
        X_input += [0, 1]
    if X_input[0] == 1:
        # Home match: keep the statistics in their original order
        X_input += list(match[['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS',
                               'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY',
                               'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A']].values)
    else:
        # Away match: swap the home/away statistics so that `team`'s
        # numbers always come first
        X_input += list(match[['FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS',
                               'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY',
                               'HY', 'AR', 'HR', 'B365A', 'B365D', 'B365H']].values)
    for x in X_input:
        if not isinstance(x, (int, float)):
            # A non-numeric value slipped in: report it and discard the sample
            print(x, type(x).__name__, X_input)
            print(match)
            return None
    return X_input
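
# Each previous match contributes 2 + 19 = 21 features (the home/away flags
# plus the 19 statistics above), so 10 previous matches plus the 3 current
# Bet365 odds give the 213-dimensional input that make_input_data checks below.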
def make_input_data(match):
    team1 = match.HomeTeam
    team2 = match.AwayTeam
    # Fetch the five previous matches of each team
    five_last_dom = get_5_last_matches(team1, match.Date)
    five_last_ext = get_5_last_matches(team2, match.Date)
    if len(five_last_dom) == 5 and len(five_last_ext) == 5:
        # Check that we found the required number of matches
        X_input_data = [match.B365H, match.B365D, match.B365A]
        for i in range(5):
            previous = five_last_dom.iloc[i, :]
            features = transform_match_into_input(previous, team1)
            if features is None:
                return None
            X_input_data += features
        for i in range(5):
            previous = five_last_ext.iloc[i, :]
            features = transform_match_into_input(previous, team2)
            if features is None:
                return None
            X_input_data += features
        if len(X_input_data) == 213:
            return X_input_data
        else:
            print("err input long X = ", len(X_input_data))
    else:
        print("input error, not enough previous matches",
              match.Date, match.HomeTeam, match.AwayTeam)
def make_output_data(match):
    # One-hot encode the full-time result: home win / draw / away win
    if match.FTR == "H":
        return [1, 0, 0]
    elif match.FTR == "D":
        return [0, 1, 0]
    elif match.FTR == "A":
        return [0, 0, 1]
    else:
        print("output error, unreadable result", match.FTR)
def make_odds_data(match):
    # Bet365 odds for home win, draw and away win
    return [match.B365H, match.B365D, match.B365A]
X_data = []
y_data = []
odds_data = []
t0 = time.time()
for i in range(df.shape[0]):
    match = df.iloc[i, :]
    X = make_input_data(match)
    y = make_output_data(match)
    odds = make_odds_data(match)
    if i % 100 == 0:
        # Progress report every 100 rows
        print(i, df.shape[0])
        print(time.time() - t0)
    if X is not None and y is not None and odds is not None:
        X_data.append(X)
        y_data.append(y)
        odds_data.append(odds)
print("Elapsed time ", time.time() - t0)
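
# Building the dataset is slow (one full scan of df per match), so it may be
# worth persisting the arrays; a convenience sketch, with made-up file names:
#
# np.save("X_data.npy", np.array(X_data))
# np.save("y_data.npy", np.array(y_data))
# np.save("odds_data.npy", np.array(odds_data))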
##
# Beware, the data is not normalised yet
from sklearn.model_selection import train_test_split

X_data = np.array(X_data)
y_data = np.array(y_data)
odds_data = np.array(odds_data)

from sklearn.preprocessing import StandardScaler

sscaler = StandardScaler()
X_data = sscaler.fit_transform(X_data)

X_train, X_test, y_train, y_test, odds_train, odds_test = train_test_split(
    X_data, y_data, odds_data)
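
# Caveat: fitting the scaler on all of X_data before splitting lets test-set
# statistics leak into the training features. A leak-free variant (a sketch
# reusing the same names) would split first:
#
# X_train, X_test, y_train, y_test, odds_train, odds_test = train_test_split(
#     X_data, y_data, odds_data)
# X_train = sscaler.fit_transform(X_train)
# X_test = sscaler.transform(X_test)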
## Building a logistic regression model
from sklearn.linear_model import LogisticRegression

result_model = LogisticRegression(max_iter=500)
# LogisticRegression expects 1-d class labels, so the one-hot targets are
# converted back to class indices (0 = home win, 1 = draw, 2 = away win)
result_model.fit(X_train, y_train.argmax(axis=1))
print(result_model.score(X_test, y_test.argmax(axis=1)))
## Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

print(confusion_matrix(y_test.argmax(axis=1), result_model.predict(X_test)))
## Improving the evaluation / cross-validation
from sklearn.model_selection import cross_val_score

print(cross_val_score(result_model, X_train, y_train.argmax(axis=1), cv=5))
## Evaluating the gain
balance = 0
for i in range(len(X_test)):
    pred = result_model.predict(np.array([X_test[i]]))[0]
    if pred == y_test[i].argmax():
        # Winning one-unit bet: pocket the bookmaker's odds minus the stake
        balance += odds_test[i][pred] - 1
    else:
        balance -= 1
print(balance)
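
# Rough profitability per one-unit bet, assuming one bet per test match:
# print("average gain per bet:", balance / len(X_test))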
## Building a neural network model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

neuronal_model = Sequential()

# adding layers: 14 Dense layers of 250 units with linear activations
n_nodes = 250
neuronal_model.add(Dense(n_nodes, activation='linear',
                         input_shape=(X_data.shape[1] - 3,)))
for _ in range(13):
    neuronal_model.add(Dense(n_nodes, activation='linear'))
# output layer
neuronal_model.add(Dense(3, activation="softmax"))
# compiling
opt = SGD(learning_rate=0.0001)
neuronal_model.compile(optimizer=opt, loss="categorical_crossentropy",
                       metrics=['accuracy'])
# fitting (the first 3 columns, the current match's odds, are excluded)
early_stopping_monitor = EarlyStopping(patience=5)
neuronal_model.fit(X_train[:, 3:], y_train, shuffle=True, verbose=2, epochs=600,
                   validation_split=0.2, callbacks=[early_stopping_monitor])
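
# Note: stacked Dense layers with 'linear' activations compose to a single
# affine map, so the depth adds no expressive power; swapping in a
# nonlinearity such as relu (a standard alternative, not what this script
# uses) would change that. A quick held-out check of the fitted model:
#
# print(neuronal_model.evaluate(X_test[:, 3:], y_test))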
## Evaluating the gain
import matplotlib.pyplot as plt

H = neuronal_model.predict(X_test[:, 3:])
balance = 1
reussite = 0
nombre = 0
mise = 0  # defined up front so the progress prints below cannot fail
for i in range(len(X_test)):
    pred = H[i].argmax()
    if odds_test[i][pred] > 1 / H[i][pred] and H[i][pred] > 0.5:
        # Bet only when the odds are attractive, i.e. the expected value is > 0
        # print(odds_test[i], np.round(H[i], 3))
        # print(np.round(odds_test[i] * H[i] - 1, 3), y_test[i])
        mise = 0.1 * balance * H[i][pred]
        # mise = 1
        balance -= mise
        # print("balance: ", balance, ", mise= ", mise)
        nombre += 1
        if y_test[i][pred] == 1:
            reussite += 1
            balance += odds_test[i][pred] * mise
    if i % 20 == 0:
        plt.scatter(i, balance)
        print(odds_test[i], np.round(H[i], 3))
        print(np.round(odds_test[i] * H[i] - 1, 3), y_test[i])
        print("\n")
        print("balance: ", balance, ", mise= ", mise)
        print("\n")
plt.show()
print(reussite, nombre)
print(balance)
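
# Hit rate of the placed bets (guarding against the no-bet case):
# print("hit rate:", reussite / max(nombre, 1))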