// Driver code for Minimum Spanning Tree based clustering. #include #include #include #include #include // Any debugging output in sample and graph code // should be conditional on the value of TRACE. // If TRACE == 0, there should be no trace output. // It is up to you what you want to do with different // non-zero values of TRACE. #define TRACE 0 // Students need to define the classes // Sample in sample.hpp // Graph in graph.hpp #include "sample.hpp" #include "graph.hpp" using namespace std; typedef vector SampleSet; // testClustering(dataFile, rank, clusterLimit) // Reads in the given data file, creates a complete Graph of // all the samples, with edge weight[i, j]s = Euclidean distance // between sample i and sample j, tells Graph to partition // the data into at most clusterLimit clusters, and prints // the clusters. // // Parameters: // dataFile: name of file with sample data // rank: number of data values in each sample // clusterLimit: maximum number of clusters to allow void testClustering(const char *dataFile, int rank, int clusterLimit); // loadSamples(dataFile, rank, sampleSet) // Reads the samples in the given data file into sampleSet. // // Parameters: // - dataFile: name of file with sample data // - rank: number of data values in each sample // - sampleSet: an empty container of Sample objects void loadSamples(const char* dataFile, int rank, SampleSet &samples); // timer() // Returns the number of seconds since the last call to timer(). double timer(); int main( ) { testClustering("toy-data.txt", 2, 10); testClustering("tumors.txt", 16, 10); } void testClustering(const char *dataFile, int rank, int clusterLimit) { SampleSet samples; cout << endl << endl << "Loading " << dataFile << endl; timer(); loadSamples(dataFile, rank, samples); cout << samples.size() << " samples read in " << timer() << " seconds" << endl; cout << "Creating graph from vertices" << endl; Graph sampleGraph(samples.begin(), samples.end()); cout << sampleGraph.vertexSize() << " vertices created in " << timer() << " seconds" << endl; cout << "Adding edges to graph" << endl; for (int i = 0; i < samples.size(); ++i) { Sample s1 = samples[i]; for (int j = i + 1; j < samples.size(); ++j) { Sample s2 = samples[j]; sampleGraph.addEdge(s1, s2, s1.euclideanDistance(s2)); } } cout << sampleGraph.edgeSize() << " edges created in " << timer() << " seconds" << endl; cout << "Making MST" << endl; int k = sampleGraph.partition(clusterLimit); cout << "Best partition size = " << k << ", found in " << timer() << " seconds" << endl; for (int i = 0; i < k; ++i) { cout << "Cluster " << i << " contains: "; SampleSet nodes; sampleGraph.getPartition(i, back_inserter(nodes)); for (int j = 0; j < nodes.size(); ++j) { if (j > 0 && (j % 10 == 0)) cout << endl; cout << nodes[j].id << " " << endl; } } } double timer() { static clock_t prev = 0; clock_t old = prev; prev = clock(); return (double) (prev - old) / CLOCKS_PER_SEC; } void loadSamples(const char* dataFile, int rank, SampleSet &samples) { ifstream in(dataFile); if (!in) { cout << "Couldn't open " << dataFile << endl; exit(-1); } string temp; if (!getline(in, temp)) { cout << "Couldn't read column headers" << endl; exit(-1); } Sample sample(rank); while (in >> sample) { samples.push_back(sample); } }