:This tends to be expensive software run by [[ad server]] companies, but it is certainly in use at all [[publicly traded search engine]]s like [[Yahoo]] and [[Google]]; in fact, you can see the "imgurl" parameter they use to track, say, which queries led to which image lookups.
----
I wrote up a basic program to perform this kind of analysis on log files, but I'm not sure why you think it would be useful for either contributors or Bomis. It's certainly not a commonly requested feature. Wouldn't view count data be more useful than link transit data? This matters because I need to know what the output format should be, and I need to have some way to justify using server resources to generate such data.

Anyway, following is the result of a couple of hours of procrastination. -- [[User:Tim Starling|Tim Starling]] 11:26, 3 Sep 2004 (EEST)
<pre>
// Reads web server access log lines (a quoted "METHOD URL PROTOCOL" request
// followed later by a quoted referrer, as in Apache's combined format) and
// counts referrer -> URL transitions for requests under /wiki/.
#include <cstdio>
#include <cstring>
#include <vector>
#include <map>
using namespace std;

#define LINE_BUF_SIZE 1000
#define REPORTING_INTERVAL 10000

int getUrlIndex(char* s);

// Orders char* keys by string contents rather than by pointer value
class char_order
{
public:
    bool operator()(const char* s1, const char* s2) const
    {
        return strcmp(s1, s2) < 0;
    }
};

typedef map<char*, int, char_order> char_map;
typedef char_map::iterator hash_iterator;
typedef vector<map<int, int> >::iterator vectormap_outer_iterator;
typedef map<int, int>::iterator vectormap_inner_iterator;

// outbound[from][to] = number of requests for URL "to" with referrer "from"
vector<map<int, int> > outbound;
vector<char*> urls;
char_map urlHash;

int main(int argc, char** argv) {
    FILE* file;
    if (argc == 1) {
        file = stdin;
    } else if (argc == 2) {
        file = fopen(argv[1], "r");
        if (!file) {
            fprintf(stderr, "Can't open file %s\n", argv[1]);
            return 1;
        }
    } else {
        fprintf(stderr, "Incorrect argument count\n");
        return 1;
    }

    char buffer[LINE_BUF_SIZE];
    int numLines = 0;
    while (!feof(file)) {
        // Print a progress dot every REPORTING_INTERVAL lines
        numLines = (numLines + 1) % REPORTING_INTERVAL;
        if (numLines == 0) {
            fprintf(stderr, ".");
            fflush(stderr);
        }
        if (!fgets(buffer, LINE_BUF_SIZE - 1, file)) {
            break;
        }

        // Find start of quoted method/URL string
        char* method = strchr(buffer, '"');
        if (!method) {
            continue;
        }
        method++;

        // Find end of method, and start of URL
        char* url = strchr(method, ' ');
        if (!url) {
            continue;
        }
        *url = '\0';
        url++;

        // Find end of URL
        char* referrer = strchr(url, ' ');
        if (!referrer) {
            continue;
        }
        *referrer = '\0';
        referrer++;

        // If URL does not contain "/wiki/", skip
        if (strstr(url, "/wiki/") == NULL) {
            continue;
        }

        // Find start of quoted referrer field
        referrer = strstr(referrer, " \"");
        if (!referrer) {
            continue;
        }
        referrer += 2;

        // Find end of referrer
        char* end = strchr(referrer, '"');
        if (!end) {
            continue;
        }
        *end = '\0';

        // Obtain indexes
        int from = getUrlIndex(referrer);
        int to = getUrlIndex(url);

        // Add to matrix
        if ((int)outbound.size() < from + 1) {
            outbound.resize(from + 1);
        }
        outbound[from][to]++;
    }

    // Output URLs
    int numUrls = (int)urls.size();
    for (int i = 0; i < numUrls; i++) {
        printf("%d\t%s\n", i, urls[i]);
        delete[] urls[i];
    }
    printf("\n");

    // Output (from, to, count) triples
    for (int i = 0; i < (int)outbound.size(); i++) {
        map<int, int>& row = outbound[i];
        for (vectormap_inner_iterator j = row.begin(); j != row.end(); j++) {
            printf("%d\t%d\t%d\n", i, j->first, j->second);
        }
    }
    return 0;
}

// Returns a small integer index for the given URL, assigning a new index
// (and copying the string to the heap) the first time a URL is seen.
int getUrlIndex(char* s)
{
    int index;
    hash_iterator iter = urlHash.find(s);
    if (iter != urlHash.end()) {
        index = iter->second;
    } else {
        // Copy string to the heap
        int length = strlen(s) + 1;
        char* newMem = new char[length];
        memcpy(newMem, s, length);
        // Add to the containers
        urls.push_back(newMem);
        index = urls.size() - 1;
        urlHash[newMem] = index;
    }
    return index;
}
</pre>
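For illustration, a rough sketch of the input and output, assuming Apache's combined log format (the entries below are made up). Given two log lines like these:
<pre>
1.2.3.4 - - [03/Sep/2004:11:26:00 +0300] "GET /wiki/Main_Page HTTP/1.0" 200 12345 "http://www.wikipedia.org/wiki/Apple" "Mozilla/4.0"
1.2.3.4 - - [03/Sep/2004:11:26:05 +0300] "GET /wiki/Main_Page HTTP/1.0" 200 12345 "http://www.wikipedia.org/wiki/Apple" "Mozilla/4.0"
</pre>
the program prints a numbered URL table, a blank line, and then tab-separated (from, to, count) triples:
<pre>
0	http://www.wikipedia.org/wiki/Apple
1	/wiki/Main_Page

0	1	2
</pre>
Note that referrers are recorded as full URLs while requested pages are recorded as server-relative paths, so the same article can appear under two different indexes.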