Talk:Link transit



:This tends to be expensive software run by [[ad server]] companies. But it is certainly in use at all [[publicly traded search engine]]s like [[Yahoo]] and [[Google]]; in fact, you can see the "imgurl" parameter they use to track, say, which queries led to which image lookups.
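:For illustration, here is a minimal sketch of pulling such a tracking parameter out of a query string. The URL and helper function below are hypothetical, and a real parser would also need URL decoding and proper parameter-boundary checks.
<pre>
#include <cstdio>
#include <cstring>

// Print the value of one query-string parameter, if present.
// Naive: matches the name anywhere, so "ximgurl=" would also hit.
void printParam(const char* url, const char* name) {
    const char* p = strstr(url, name);
    if (!p) return;
    p += strlen(name);
    const char* end = strchr(p, '&');   // value ends at '&' or end of string
    int len = end ? (int)(end - p) : (int)strlen(p);
    printf("%.*s\n", len, p);
}

int main() {
    // Hypothetical Google Images result URL
    const char* url = "/imgres?imgurl=http://example.com/a.jpg&q=link+transit";
    printParam(url, "imgurl=");   // prints http://example.com/a.jpg
    return 0;
}
</pre>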
----
I wrote up a basic program to perform this kind of analysis on log files, but I'm not sure why you think it would be useful for either contributors or Bomis. It's certainly not a commonly requested feature. Wouldn't view count data be more useful than link transit data? This matters because I need to know what the output format should be, and I need to have some way to justify using server resources to generate such data.
Anyway, the following is the result of a couple of hours of procrastination. -- [[User:Tim Starling|Tim Starling]] 11:26, 3 Sep 2004 (EEST)
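For reference, it expects lines in the usual Apache combined log format; the request URL comes from the first quoted field and the referrer from the third. A made-up example line (host, status, sizes and user agent are invented):
<pre>
1.2.3.4 - - [03/Sep/2004:11:26:00 +0300] "GET /wiki/Link_transit HTTP/1.0" 200 5120 "http://en.wikipedia.org/wiki/Special:Recentchanges" "Mozilla/4.0 (compatible)"
</pre>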
<pre>
#include <cstdio>
#include <cstring>
#include <string>
#include <iostream>
#include <vector>
#include <map>

using namespace std;

#define LINE_BUF_SIZE 1000
#define REPORTING_INTERVAL 10000

int getUrlIndex(char* s);

// Order C strings by content rather than by pointer value
class char_order
{
public:
    bool operator()(const char* s1, const char* s2) const
    {
        return strcmp(s1, s2) < 0;
    }
};

typedef map<char*, int, char_order> char_map;
typedef char_map::iterator hash_iterator;
typedef map<int, int>::iterator vectormap_inner_iterator;

// Sparse transit matrix: outbound[from][to] = number of hits
vector<map<int, int> > outbound;
// Index-to-URL table and its inverse
vector<char*> urls;
char_map urlHash;

int main(int argc, char** argv) {
    FILE* file;
    if (argc == 1) {
        file = stdin;
    } else if (argc == 2) {
        file = fopen(argv[1], "r");
        if (!file) {
            fprintf(stderr, "Can't open file %s\n", argv[1]);
            return 1;
        }
    } else {
        fprintf(stderr, "Incorrect argument count\n");
        return 1;
    }

    char buffer[LINE_BUF_SIZE];
    int numLines = 0;
    while (!feof(file)) {
        // Progress dot on stderr every REPORTING_INTERVAL lines
        numLines = (numLines + 1) % REPORTING_INTERVAL;
        if (numLines == 0) {
            fprintf(stderr, ".");
            fflush(stderr);
        }
        if (!fgets(buffer, LINE_BUF_SIZE, file)) {
            break;
        }
        // Find start of quoted method/URL string
        char* method = strchr(buffer, '"');
        if (!method) {
            continue;
        }
        method++;
        // Find end of method, and start of URL
        char* url = strchr(method, ' ');
        if (!url) {
            continue;
        }
        *url = '\0';
        url++;
        // Find end of URL
        char* referrer = strchr(url, ' ');
        if (!referrer) {
            continue;
        }
        *referrer = '\0';
        referrer++;
        // If URL does not contain "/wiki/", skip
        if (strstr(url, "/wiki/") == NULL) {
            continue;
        }
        // Find start of referrer (the next quoted field)
        referrer = strstr(referrer, " \"");
        if (!referrer) {
            continue;
        }
        referrer += 2;
        // Find end of referrer
        char* end = strchr(referrer, '"');
        if (!end) {
            continue;
        }
        *end = '\0';
        // Obtain indexes
        int from = getUrlIndex(referrer);
        int to = getUrlIndex(url);
        // Add to matrix
        if ((int)outbound.size() < from + 1) {
            outbound.resize(from + 1);
        }
        outbound[from][to]++;
    }

    // Output URL index table, freeing each string once printed
    int numUrls = urls.size();
    for (int i = 0; i < numUrls; i++) {
        printf("%d\t%s\n", i, urls[i]);
        delete[] urls[i];
    }
    printf("\n");

    // Output the sparse matrix as (from, to, count) triples
    for (int i = 0; i < (int)outbound.size(); i++) {
        map<int, int>& row = outbound[i];
        for (vectormap_inner_iterator j = row.begin(); j != row.end(); j++) {
            printf("%d\t%d\t%d\n", i, j->first, j->second);
        }
    }
    return 0;
}

// Return the index for a URL, interning a heap copy the first
// time it is seen
int getUrlIndex(char* s)
{
    int index;
    hash_iterator iter = urlHash.find(s);
    if (iter != urlHash.end()) {
        index = iter->second;
    } else {
        // Copy string to the heap
        int length = strlen(s) + 1;
        char* newMem = new char[length];
        memcpy(newMem, s, length);
        // Add to the containers
        urls.push_back(newMem);
        index = urls.size() - 1;
        urlHash[newMem] = index;
    }
    return index;
}
</pre>
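To try it, something like this should work (the compiler invocation and log file name are just my assumptions):
<pre>
g++ -O2 transit.cpp -o transit
./transit access_log > transit.txt
</pre>
The output is a tab-separated URL index table, a blank line, then (from, to, count) triples referring to those indexes. For the sample log line above it would be:
<pre>
0	http://en.wikipedia.org/wiki/Special:Recentchanges
1	/wiki/Link_transit

0	1	1
</pre>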