Commit b83bb931 authored by Leigh Smith

Replaced cross-validation KNN code with sklearn equivalent.

parent 578de14b
{
"metadata": {
"name": "",
"signature": "sha256:417a167478ba4dfbc3a0ee4785668d8f8953045aa65b6a192f488670ef2b691d"
"signature": "sha256:75f989dd2ab73ca4005602cfbe578941785ac582da4dd30251ada2d364be673a"
},
"nbformat": 3,
"nbformat_minor": 0,
...@@ -92,52 +92,67 @@
"2. 1 test set is tested using the classifier trained on the remaining 9.\n",
"3. We then do test/train on all of the other sets and average the percentages. \n",
"\n",
"To achieve the first step (divide our training set into k disjoint subsets), use the function crossvalind.m (posted in the Utilities)\n", "To achieve the first step (divide our training set into k disjoint subsets), use the function [Kfold](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html) in the scikit.learn cross_validation package.\n",
"\n", "\n",
" INDICES = CROSSVALIND('Kfold',N,K) returns randomly generated indices\n", " K-Folds cross validation iterator.\n",
" for a K-fold cross-validation of N observations. INDICES contains equal\n", " Provides train/test indices to split data in train test sets. Split dataset into k consecutive folds (without shuffling).\n",
" (or approximately equal) proportions of the integers 1 through K that\n", "\n",
" define a partition of the N observations into K disjoint subsets.\n", " You can visit the scikit.learn documentation to look at all the other options. This code is also posted as a template in \n",
"\n", " `/usr/ccrma/courses/mir2014/Toolboxes/crossValidationTemplate.m`\n",
" You can type help crossvalind to look at all the other options. This code is also posted as a template in \n", " "
" `/usr/ccrma/courses/mir2010/Toolboxes/crossValidation.m`\n",
"\n",
" % This code is provided as a template for your cross-validation\n",
" % computation. Replace the variables \"features\", \"labels\" with your own\n",
" % data. \n",
" % As well, you can replace the code in the \"BUILD\" and \"EVALUATE\" sections\n",
" % to be useful with other types of Classifiers.\n",
" %\n",
" %% CROSS VALIDATION \n",
" numFolds = 10; % how many cross-validation folds do you want - (default=10)\n",
" numInstances = size(features,1); % this is the total number of instances in our training set\n",
" numFeatures = size(features,2); % this is the total number of instances in our training set\n",
" indices = crossvalind('Kfold',numInstances,numFolds) % divide test set into 10 random subsets\n",
" clear errors\n",
" for i = 1:10\n",
" % SEGMENT DATA INTO FOLDS\n",
" disp(['fold: ' num2str(i)]) \n",
" test = (indices == i) ; % which points are in the test set\n",
" train = ~test; % all points that are NOT in the test set\n",
" % SCALE\n",
" [trainingFeatures,mf,sf]=scale(features(train,:));\n",
" % BUILD NEW MODEL - ADD YOUR MODEL BUILDING CODE HERE...\n",
" model = knn(numFeatures,2,3,trainingFeatures,labels(train,:)); \n",
" % RESCALE TEST DATA TO TRAINING SCALE SPACE\n",
" [testingFeatures]=rescale(features(test,:),mf,sf);\n",
" % EVALUATE WITH TEST DATA - ADD YOUR MODEL EVALUATION CODE HERE\n",
" [voting,model_output] = knnfwd(model ,testingFeatures);\n",
" % CONVERT labels(test,:) LABELS TO SAME FORMAT TO COMPUTE ERROR \n",
" labels_test = zeros(size(model_output,1),1); % create array of 0s\n",
" labels_test(find(labels(test,1)==1))=1; % convert column 1 to class 1 \n",
" labels_test(find(labels(test,2)==1))=2; % convert column 2 to class 2 \n",
" % COUNT ERRORS \n",
" errors(i) = mean ( model_output ~= labels_test )\n",
" end\n",
" disp(['cross validation error: ' num2str(mean(errors))])\n",
" disp(['cross validation accuracy: ' num2str(1-mean(errors))])"
"To achieve the first step (divide our training set into k disjoint subsets), use the function [Kfold](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html) in the scikit.learn cross_validation package.\n",
"\n",
"    K-Folds cross validation iterator.\n",
"    Provides train/test indices to split data in train test sets. Split dataset into k consecutive folds (without shuffling).\n",
"\n",
"    You can visit the scikit.learn documentation to look at all the other options. This code is also posted as a template in \n",
"    `/usr/ccrma/courses/mir2014/Toolboxes/crossValidationTemplate.m`\n",
"    "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def crossValidateKNN(features, labels):\n",
" \"\"\"\n",
" This code is provided as a template for your cross-validation\n",
" computation. Pass into the variables \"features\", \"labels\" your own data. \n",
"\n",
" As well, you can replace the code in the \"BUILD\" and \"EVALUATE\" sections\n",
" to be useful with other types of Classifiers.\n",
" \"\"\"\n",
" #\n",
" # CROSS VALIDATION \n",
" # The features array is arranged as rows of instances, columns of features in our training set.\n",
" numInstances, numFeatures = features.shape\n",
" numFolds = min(10, numInstances) # how many cross-validation folds do you want - (default=10)\n",
" # divide test set into 10 random subsets\n",
" indices = cross_validation.KFold(numInstances, n_folds = numFolds)\n",
"\n",
" errors = np.empty(numFolds)\n",
" for foldIndex, (train_index, test_index) in enumerate(indices):\n",
" # SEGMENT DATA INTO FOLDS\n",
" print('Fold: %d' % foldIndex) \n",
" print(\"TRAIN: %s\" % train_index)\n",
" print(\"TEST: %s\" % test_index)\n",
" \n",
" # SCALE\n",
" trainingFeatures, mf, sf = scale(features.take(train_index, 0))\n",
" # BUILD NEW MODEL - ADD YOUR MODEL BUILDING CODE HERE...\n",
" # model = knn(numFeatures, 2, 3, trainingFeatures, labels[train_index, :]) \n",
" model = KNeighborsClassifier(n_neighbors = 3)\n",
" model.fit(trainingFeatures, labels.take(train_index, 0))\n",
" # RESCALE TEST DATA TO TRAINING SCALE SPACE\n",
" testingFeatures = rescale(features.take(test_index, 0), mf, sf)\n",
" # EVALUATE WITH TEST DATA - ADD YOUR MODEL EVALUATION CODE HERE\n",
" # voting, model_output = knnfwd(model, testingFeatures)\n",
" model_output = model.predict(testingFeatures)\n",
" print(\"KNN prediction %s\" % model_output) # Debugging.\n",
" # CONVERT labels(test,:) LABELS TO SAME FORMAT TO COMPUTE ERROR \n",
" labels_test = labels.take(test_index, 0)\n",
" # COUNT ERRORS. matches is a boolean array, taking the mean does the right thing.\n",
" matches = model_output != labels_test\n",
" errors[foldIndex] = matches.mean()\n",
" print('cross validation error: %f' % errors.mean())\n",
" print('cross validation accuracy: %f' % (1.0 - errors.mean()))"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
...
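
For reference, below is a minimal, self-contained sketch of the KFold cross-validation pattern that the new cell follows. It is an illustration, not the notebook's exact code: it assumes the pre-0.18 `sklearn.cross_validation` module that the commit targets (newer scikit-learn releases expose the same iterator as `sklearn.model_selection.KFold`), substitutes a small synthetic two-class dataset for the notebook's real features and labels, and inlines the standardisation step that the notebook delegates to its `scale`/`rescale` helpers defined elsewhere.

```python
import numpy as np
from sklearn import cross_validation            # scikit-learn < 0.18; >= 0.18 uses sklearn.model_selection
from sklearn.neighbors import KNeighborsClassifier

# Toy stand-in data: 20 instances, 2 features, 2 classes (assumption for illustration).
np.random.seed(0)
features = np.vstack([np.random.randn(10, 2), np.random.randn(10, 2) + 3.0])
labels = np.array([0] * 10 + [1] * 10)

numInstances = features.shape[0]
numFolds = min(10, numInstances)
folds = cross_validation.KFold(numInstances, n_folds=numFolds)

errors = np.empty(numFolds)
for foldIndex, (train_index, test_index) in enumerate(folds):
    # Standardise with statistics of the training fold only
    # (stand-in for the notebook's scale()/rescale() helpers).
    mf = features[train_index].mean(axis=0)
    sf = features[train_index].std(axis=0)
    trainingFeatures = (features[train_index] - mf) / sf
    testingFeatures = (features[test_index] - mf) / sf

    # Fit a 3-nearest-neighbour classifier on the training fold.
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(trainingFeatures, labels[train_index])

    # Predict the held-out fold and record its error rate.
    predictions = model.predict(testingFeatures)
    errors[foldIndex] = (predictions != labels[test_index]).mean()

print('cross validation error: %f' % errors.mean())
print('cross validation accuracy: %f' % (1.0 - errors.mean()))
```

As in the committed cell, the scaling parameters come from the training fold alone and are reused to rescale the test fold, so no statistics of the held-out data leak into the model.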