Commit b83bb931 authored by Leigh Smith

Replaced cross-validation KNN code with sklearn equivalent.

parent 578de14b
{
"metadata": {
"name": "",
"signature": "sha256:75f989dd2ab73ca4005602cfbe578941785ac582da4dd30251ada2d364be673a"
},
"nbformat": 3,
"nbformat_minor": 0,
@@ -92,52 +92,67 @@
"2. One subset is held out as the test set and evaluated using the classifier trained on the remaining 9.\n",
"3. We then repeat, holding out each of the other subsets in turn, and average the percentages. \n",
"\n",
"To achieve the first step (divide our training set into k disjoint subsets), use the function crossvalind.m (posted in the Utilities)\n",
"\n",
" INDICES = CROSSVALIND('Kfold',N,K) returns randomly generated indices\n",
" for a K-fold cross-validation of N observations. INDICES contains equal\n",
" (or approximately equal) proportions of the integers 1 through K that\n",
" define a partition of the N observations into K disjoint subsets.\n",
"\n",
" You can type help crossvalind to look at all the other options. This code is also posted as a template in \n",
" `/usr/ccrma/courses/mir2010/Toolboxes/crossValidation.m`\n",
"\n",
" % This code is provided as a template for your cross-validation\n",
" % computation. Replace the variables \"features\", \"labels\" with your own\n",
" % data. \n",
" % As well, you can replace the code in the \"BUILD\" and \"EVALUATE\" sections\n",
" % to be useful with other types of Classifiers.\n",
" %\n",
" %% CROSS VALIDATION \n",
" numFolds = 10; % how many cross-validation folds do you want - (default=10)\n",
" numInstances = size(features,1); % this is the total number of instances in our training set\n",
 " numFeatures = size(features,2); % this is the total number of features in our training set\n",
 " indices = crossvalind('Kfold',numInstances,numFolds); % divide the training set into numFolds random subsets\n",
" clear errors\n",
" for i = 1:10\n",
" % SEGMENT DATA INTO FOLDS\n",
" disp(['fold: ' num2str(i)]) \n",
" test = (indices == i) ; % which points are in the test set\n",
" train = ~test; % all points that are NOT in the test set\n",
" % SCALE\n",
" [trainingFeatures,mf,sf]=scale(features(train,:));\n",
" % BUILD NEW MODEL - ADD YOUR MODEL BUILDING CODE HERE...\n",
" model = knn(numFeatures,2,3,trainingFeatures,labels(train,:)); \n",
" % RESCALE TEST DATA TO TRAINING SCALE SPACE\n",
" [testingFeatures]=rescale(features(test,:),mf,sf);\n",
" % EVALUATE WITH TEST DATA - ADD YOUR MODEL EVALUATION CODE HERE\n",
" [voting,model_output] = knnfwd(model ,testingFeatures);\n",
" % CONVERT labels(test,:) LABELS TO SAME FORMAT TO COMPUTE ERROR \n",
" labels_test = zeros(size(model_output,1),1); % create array of 0s\n",
" labels_test(find(labels(test,1)==1))=1; % convert column 1 to class 1 \n",
" labels_test(find(labels(test,2)==1))=2; % convert column 2 to class 2 \n",
" % COUNT ERRORS \n",
" errors(i) = mean ( model_output ~= labels_test )\n",
" end\n",
" disp(['cross validation error: ' num2str(mean(errors))])\n",
" disp(['cross validation accuracy: ' num2str(1-mean(errors))])"
"To achieve the first step (divide our training set into k disjoint subsets), use the [KFold](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html) class in the scikit-learn cross_validation package.\n",
"\n",
" K-Folds cross validation iterator.\n",
" Provides train/test indices to split data in train test sets. Split dataset into k consecutive folds (without shuffling).\n",
"\n",
 " You can visit the scikit-learn documentation to look at all the other options. This code is also posted as a template in \n",
" `/usr/ccrma/courses/mir2014/Toolboxes/crossValidationTemplate.m`\n",
" "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# These imports may duplicate an earlier cell; re-importing is harmless in a notebook.\n",
"import numpy as np\n",
"from sklearn import cross_validation\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"def crossValidateKNN(features, labels):\n",
" \"\"\"\n",
" This code is provided as a template for your cross-validation\n",
 " computation. Pass your own data in via the \"features\" and \"labels\" arguments. \n",
"\n",
" As well, you can replace the code in the \"BUILD\" and \"EVALUATE\" sections\n",
" to be useful with other types of Classifiers.\n",
" \"\"\"\n",
" #\n",
" # CROSS VALIDATION \n",
" # The features array is arranged as rows of instances, columns of features in our training set.\n",
" numInstances, numFeatures = features.shape\n",
" numFolds = min(10, numInstances) # how many cross-validation folds do you want - (default=10)\n",
 " # Divide the training set into numFolds consecutive subsets (KFold does not shuffle)\n",
" indices = cross_validation.KFold(numInstances, n_folds = numFolds)\n",
"\n",
" errors = np.empty(numFolds)\n",
" for foldIndex, (train_index, test_index) in enumerate(indices):\n",
" # SEGMENT DATA INTO FOLDS\n",
" print('Fold: %d' % foldIndex) \n",
" print(\"TRAIN: %s\" % train_index)\n",
" print(\"TEST: %s\" % test_index)\n",
" \n",
" # SCALE\n",
" trainingFeatures, mf, sf = scale(features.take(train_index, 0))\n",
" # BUILD NEW MODEL - ADD YOUR MODEL BUILDING CODE HERE...\n",
" # model = knn(numFeatures, 2, 3, trainingFeatures, labels[train_index, :]) \n",
" model = KNeighborsClassifier(n_neighbors = 3)\n",
" model.fit(trainingFeatures, labels.take(train_index, 0))\n",
" # RESCALE TEST DATA TO TRAINING SCALE SPACE\n",
" testingFeatures = rescale(features.take(test_index, 0), mf, sf)\n",
" # EVALUATE WITH TEST DATA - ADD YOUR MODEL EVALUATION CODE HERE\n",
" # voting, model_output = knnfwd(model, testingFeatures)\n",
" model_output = model.predict(testingFeatures)\n",
" print(\"KNN prediction %s\" % model_output) # Debugging.\n",
 " # SELECT THE GROUND-TRUTH LABELS FOR THE TEST FOLD TO COMPUTE THE ERROR \n",
" labels_test = labels.take(test_index, 0)\n",
 " # COUNT ERRORS. mismatches is a boolean array; taking the mean gives the fold's error rate.\n",
 " mismatches = model_output != labels_test\n",
 " errors[foldIndex] = mismatches.mean()\n",
" print('cross validation error: %f' % errors.mean())\n",
" print('cross validation accuracy: %f' % (1.0 - errors.mean()))"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
......
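For reference, the mechanics behind the notebook cell above can be exercised outside the notebook. The following is a minimal sketch in plain NumPy, with assumed stand-ins: `kfold_indices` mimics sklearn's unshuffled `KFold` iterator, and a hand-rolled Euclidean 1-nearest-neighbour predictor replaces `KNeighborsClassifier` (the course's `scale`/`rescale` utilities are omitted):

```python
import numpy as np

def kfold_indices(num_instances, num_folds):
    """Yield (train_index, test_index) pairs over num_folds consecutive
    folds, mirroring an unshuffled KFold iterator."""
    fold_sizes = np.full(num_folds, num_instances // num_folds)
    fold_sizes[: num_instances % num_folds] += 1  # spread the remainder
    indices = np.arange(num_instances)
    start = 0
    for size in fold_sizes:
        test_index = indices[start:start + size]
        train_index = np.concatenate([indices[:start], indices[start + size:]])
        yield train_index, test_index
        start += size

def cross_validate_1nn(features, labels, num_folds=10):
    """Average error of a 1-nearest-neighbour classifier across the folds."""
    errors = []
    for train_index, test_index in kfold_indices(len(features), num_folds):
        train_X, train_y = features[train_index], labels[train_index]
        for i in test_index:
            # Euclidean 1-NN: predict the label of the closest training point.
            nearest = np.argmin(np.linalg.norm(train_X - features[i], axis=1))
            errors.append(train_y[nearest] != labels[i])
    return float(np.mean(errors))
```

Note that because the folds are consecutive rather than shuffled, data sorted by class can put an entire class into one test fold; interleaving (or shuffling before splitting) avoids that pitfall.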