Talk:Link transit

    From Consumerium development wiki R&D Wiki
    Revision as of 04:13, 4 September 2004 by Tim Starling (talk | contribs)

    I'm not sure if our apache is configured to even log internal clicks. Maybe, maybe not. Anyways I don't currently have any software to report or analyze this information. If you know of GPL or otherwise free tools for digging this information from httpd logs pls post here --Juxo 16:47, 1 Sep 2004 (EEST)

    This tends to be expensive software run by ad server companies. But it is certainly in use in all publicly traded search engines like Yahoo and Google, in fact, you can see the "imgurl" they use to track say which queries led to which image lookups.

    I wrote up a basic program to perform this kind of analysis on log files, but I'm not sure why you think it would be useful for either contributors or Bomis. It's certainly not a commonly requested feature. Wouldn't view count data be more useful than link transit data?

    Both are useful for the same reasons. And both are not available. "Server load" is a lousy excuse, when Wikimedia could raise all the money it needed for hardware with an independent board.
    What reasons are those? -- Tim Starling 07:13, 4 Sep 2004 (EEST)

    This matters because I need to know what the output format should be, and I need to have some way to justify using server resources to generate such data.

    A map of nodes/pages with the number of link transits on each edge, such edges representing a link, is the obvious display. But that would be huge so one must be able to filter down to a very small number of pages and links that hold them together, typically the most heavily clustered / deeply connected to each other. Xerox PARC did some research on this about twenty years ago.

    Anyway, following is the result of a couple of hours of procrastination. -- Tim Starling 11:26, 3 Sep 2004 (EEST)

    It looks like C to me and it looks like that the Main() takes standard httpd.log as input. I'll run this on our logs sometime when I have the time. Kinda busy now. --Juxo 13:02, 3 Sep 2004 (EEST)
    It's C++. It outputs two sections separated by a double linefeed. The first is an indexed list of URLs. The second has three values on each line: index from, index to and the transit count. The idea is that you would read all this into a relational database with an index on all three columns, then perform whatever analysis you need to perform. -- Tim Starling 07:13, 4 Sep 2004 (EEST)
    #include <string>
    #include <iostream>
    #include <vector>
    #include <map>
    using namespace std;
    
    #define LINE_BUF_SIZE 1000
    #define REPORTING_INTERVAL 10000
    
    int getUrlIndex(char* s);
    
    class char_order
    {
    public:
    	bool operator()(char* s1, char* s2) 
    	{
    		return strcmp(s1, s2) < 0;
    	}
    };
    
    typedef map<char*, int, char_order> char_map;
    
    typedef char_map::iterator hash_iterator;
    typedef vector<map<int, int> >::iterator vectormap_outer_iterator;
    typedef map<int, int>::iterator vectormap_inner_iterator;
    
    vector<map<int, int> > outbound;
    vector<char*> urls;
    char_map urlHash;
    
    int main(int argc, char** argv) {
    	FILE* file;
    	if (argc == 1) {
    		file = stdin;
    	} else if (argc == 2) {
    		file = fopen(argv[1], "r");
    		if (!file) {
    			printf("Can't open file %s\n", argv[1]);
    			return 1;
    		}
    	} else {
    		printf("Incorrect argument count\n");
    		return 1;
    	}
    
    	
    	char buffer[LINE_BUF_SIZE];
    	int numLines = 0;
    	
    	while (!feof(file)) {
    		numLines = (numLines+1)%REPORTING_INTERVAL;
    		if (numLines == 0) {
    			fprintf(stderr, ".");
    			fflush(stderr);
    		}
    		if (!fgets(buffer, LINE_BUF_SIZE-1, file)) {
    			break;
    		}
    		
    		// Find start of quoted method/URL string
    		char* method = strchr(buffer, '"');
    		if (!method) {
    			continue;
    		}
    		method++;
    
    		// Find end of method, and start of URL
    		char* url = strchr(method, ' ');
    		if (!url) {
    			continue;
    		}
    		*url = '\0';
    		url++;
    
    		// Find end of URL
    		char* referrer = strchr(url, ' ');
    		if (!url) {
    			continue;
    		}
    		*referrer = '\0';
    		referrer++;
    		
    		// If URL does not contain "wiki", skip
    		if (strstr(url,"/wiki/") == NULL) {
    			continue;
    		}
    		
    		// Find start of referrer
    		referrer = strstr(referrer, " \"");
    		if (!referrer) {
    			continue;
    		}
    		referrer += 2;
    
    		// Find end of referrer
    		char* end = strchr(referrer, '"');
    		if (!end) {
    			continue;
    		}
    		*end = '\0';
    		
    		// Obtain indexes
    		int from = getUrlIndex(referrer);
    		int to = getUrlIndex(url);
    
    		// Add to matrix
    		if (outbound.size() < from+1) {
    			outbound.resize(from+1);
    		}
    		outbound[from][to]++;
    	}
    
    	// Output URLs
    	int numUrls = urls.size();
    	for (int i=0; i<numUrls; i++) {
    		printf("%d\t%s\n", i, urls[i]);
    		delete[] urls[i];
    	}
    	printf("\n");
    	for (int i=0; i<outbound.size(); i++) {
    		map<int,int> & row = outbound[i]; 
    		for (vectormap_inner_iterator j=row.begin(); j!=row.end(); j++) {
    			printf("%d\t%d\t%d\n", i, j->first, j->second);
    		}
    	}
    	return 0;
    }
    
    
    int getUrlIndex(char* s)
    {
    	int index;
    	hash_iterator iter = urlHash.find(s);
    	if (iter != urlHash.end()) {
    		index = iter->second;
    	} else {
    		// Copy string to the heap
    		int length = strlen(s)+1;
    		char* newMem = new char[length];
    		memcpy(newMem, s, length);
    
    		// Add to the containers
    		urls.push_back(newMem);
    		index = urls.size() - 1;
    		urlHash[newMem] = index;
    	}
    	return index;
    }