Talk:Link transit



:This tends to be expensive software run by [[ad server]] companies. But it is certainly in use at all [[publicly traded search engine]]s like [[Yahoo]] and [[Google]]; in fact, you can see the "imgurl" parameter they use to track, say, which queries led to which image lookups.
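:For illustration, here is a minimal sketch of pulling such a tracking parameter out of a query string. The URL and helper function below are hypothetical, and a real parser would also need URL decoding and proper parameter-boundary checks.
<pre>
#include <cstdio>
#include <cstring>

// Print the value of one query-string parameter, if present.
// Naive: matches the name anywhere, so "ximgurl=" would also hit.
void printParam(const char* url, const char* name) {
    const char* p = strstr(url, name);
    if (!p) return;
    p += strlen(name);
    const char* end = strchr(p, '&');   // value ends at '&' or end of string
    int len = end ? (int)(end - p) : (int)strlen(p);
    printf("%.*s\n", len, p);
}

int main() {
    // Hypothetical Google Images result URL
    const char* url = "/imgres?imgurl=http://example.com/a.jpg&q=link+transit";
    printParam(url, "imgurl=");   // prints http://example.com/a.jpg
    return 0;
}
</pre>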
----
I wrote up a basic program to perform this kind of analysis on log files, but I'm not sure why you think it would be useful for either contributors or Bomis. It's certainly not a commonly requested feature. Wouldn't view count data be more useful than link transit data? This matters because I need to know what the output format should be, and I need to have some way to justify using server resources to generate such data.
Anyway, the following is the result of a couple of hours of procrastination. -- [[User:Tim Starling|Tim Starling]] 11:26, 3 Sep 2004 (EEST)
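For reference, it expects lines in the usual Apache combined log format; the request URL comes from the first quoted field and the referrer from the third. A made-up example line (host, status, sizes and user agent are invented):
<pre>
1.2.3.4 - - [03/Sep/2004:11:26:00 +0300] "GET /wiki/Link_transit HTTP/1.0" 200 5120 "http://en.wikipedia.org/wiki/Special:Recentchanges" "Mozilla/4.0 (compatible)"
</pre>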
<pre>
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <vector>
#include <map>

using namespace std;

#define LINE_BUF_SIZE 1000
#define REPORTING_INTERVAL 10000

int getUrlIndex(char* s);

// Order C strings by content rather than by pointer value
class char_order
{
public:
    bool operator()(const char* s1, const char* s2) const
    {
        return strcmp(s1, s2) < 0;
    }
};

typedef map<char*, int, char_order> char_map;
typedef char_map::iterator hash_iterator;
typedef map<int, int>::iterator vectormap_inner_iterator;

// Sparse transit matrix: outbound[from][to] = number of hits
vector<map<int, int> > outbound;
// Index-to-URL table and its inverse
vector<char*> urls;
char_map urlHash;

int main(int argc, char** argv) {
    FILE* file;
    if (argc == 1) {
        file = stdin;
    } else if (argc == 2) {
        file = fopen(argv[1], "r");
        if (!file) {
            fprintf(stderr, "Can't open file %s\n", argv[1]);
            return 1;
        }
    } else {
        fprintf(stderr, "Incorrect argument count\n");
        return 1;
    }

    char buffer[LINE_BUF_SIZE];
    int numLines = 0;
    while (!feof(file)) {
        // Progress dot on stderr every REPORTING_INTERVAL lines
        numLines = (numLines + 1) % REPORTING_INTERVAL;
        if (numLines == 0) {
            fprintf(stderr, ".");
            fflush(stderr);
        }
        if (!fgets(buffer, LINE_BUF_SIZE, file)) {
            break;
        }
        // Find start of quoted method/URL string
        char* method = strchr(buffer, '"');
        if (!method) {
            continue;
        }
        method++;
        // Find end of method, and start of URL
        char* url = strchr(method, ' ');
        if (!url) {
            continue;
        }
        *url = '\0';
        url++;
        // Find end of URL
        char* referrer = strchr(url, ' ');
        if (!referrer) {
            continue;
        }
        *referrer = '\0';
        referrer++;
        // If URL does not contain "/wiki/", skip
        if (strstr(url, "/wiki/") == NULL) {
            continue;
        }
        // Find start of referrer (the next quoted field)
        referrer = strstr(referrer, " \"");
        if (!referrer) {
            continue;
        }
        referrer += 2;
        // Find end of referrer
        char* end = strchr(referrer, '"');
        if (!end) {
            continue;
        }
        *end = '\0';
        // Obtain indexes
        int from = getUrlIndex(referrer);
        int to = getUrlIndex(url);
        // Add to matrix
        if ((int)outbound.size() < from + 1) {
            outbound.resize(from + 1);
        }
        outbound[from][to]++;
    }

    // Output URL index table, freeing each string once printed
    int numUrls = urls.size();
    for (int i = 0; i < numUrls; i++) {
        printf("%d\t%s\n", i, urls[i]);
        delete[] urls[i];
    }
    printf("\n");

    // Output the sparse matrix as (from, to, count) triples
    for (int i = 0; i < (int)outbound.size(); i++) {
        map<int, int>& row = outbound[i];
        for (vectormap_inner_iterator j = row.begin(); j != row.end(); j++) {
            printf("%d\t%d\t%d\n", i, j->first, j->second);
        }
    }
    return 0;
}

// Return the index for a URL, interning a heap copy the first
// time it is seen
int getUrlIndex(char* s)
{
    int index;
    hash_iterator iter = urlHash.find(s);
    if (iter != urlHash.end()) {
        index = iter->second;
    } else {
        // Copy string to the heap
        int length = strlen(s) + 1;
        char* newMem = new char[length];
        memcpy(newMem, s, length);
        // Add to the containers
        urls.push_back(newMem);
        index = urls.size() - 1;
        urlHash[newMem] = index;
    }
    return index;
}
</pre>
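To try it, something like this should work (the compiler invocation and log file name are just my assumptions):
<pre>
g++ -O2 transit.cpp -o transit
./transit access_log > transit.txt
</pre>
The output is a tab-separated URL index table, a blank line, then (from, to, count) triples referring to those indexes. For the sample log line above it would be:
<pre>
0	http://en.wikipedia.org/wiki/Special:Recentchanges
1	/wiki/Link_transit

0	1	1
</pre>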