    Talk:Link transit

    :This tends to be expensive software run by [[ad server]] companies, but it is certainly in use by all [[publicly traded search engine]]s such as [[Yahoo]] and [[Google]]. In fact, you can see the "imgurl" parameter they use to track, say, which queries led to which image lookups; it is visible right in the Google URIs when you follow links to results.
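    :As an aside, here is a minimal C++ sketch of how such a tracking parameter can be read back out of a result URL. The getQueryParam helper and the example URL are made up for illustration; real result URLs carry more parameters and percent-encoding.
    <pre>
    #include <string>
    #include <iostream>
    using namespace std;

    // Return the value of the named query parameter (e.g. "imgurl") from a URL,
    // or an empty string if it is not present. No percent-decoding is attempted.
    string getQueryParam(const string& url, const string& name)
    {
        string::size_type qs = url.find('?');
        if (qs == string::npos) {
            return "";
        }
        string query = url.substr(qs + 1);
        string::size_type pos = 0;
        while (pos < query.size()) {
            string::size_type amp = query.find('&', pos);
            if (amp == string::npos) {
                amp = query.size();
            }
            string pair = query.substr(pos, amp - pos);
            string::size_type eq = pair.find('=');
            if (eq != string::npos && pair.substr(0, eq) == name) {
                return pair.substr(eq + 1);
            }
            pos = amp + 1;
        }
        return "";
    }

    int main()
    {
        string url = "http://images.google.com/imgres?imgurl=http://example.com/pic.jpg&q=test";
        cout << getQueryParam(url, "imgurl") << endl;  // prints the tracked target URL
        return 0;
    }
    </pre>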
    ----
    I wrote up a basic program to perform this kind of analysis on log files, but I'm not sure why you think it would be useful for either contributors or Bomis. It's certainly not a commonly requested feature. Wouldn't view count data be more useful than link transit data? This matters because I need to know what the output format should be, and I need to have some way to justify using server resources to generate such data.
    Anyway, following is the result of a couple of hours of procrastination. -- [[User:Tim Starling|Tim Starling]] 11:26, 3 Sep 2004 (EEST)
    <pre>
    #include <string>
    #include <iostream>
    #include <vector>
    #include <map>
    #include <cstdio>
    #include <cstring>

    using namespace std;

    #define LINE_BUF_SIZE 1000
    #define REPORTING_INTERVAL 10000

    int getUrlIndex(char* s);

    // Orders C strings by content rather than by pointer value
    class char_order
    {
    public:
        bool operator()(char* s1, char* s2)
        {
            return strcmp(s1, s2) < 0;
        }
    };

    typedef map<char*, int, char_order> char_map;
    typedef char_map::iterator hash_iterator;
    typedef vector<map<int, int> >::iterator vectormap_outer_iterator;
    typedef map<int, int>::iterator vectormap_inner_iterator;

    // Sparse transit matrix: outbound[from][to] = number of hits
    vector<map<int, int> > outbound;
    // URL index -> URL string
    vector<char*> urls;
    // URL string -> URL index
    char_map urlHash;
    int main(int argc, char** argv) {
        FILE* file;
        if (argc == 1) {
            file = stdin;
        } else if (argc == 2) {
            file = fopen(argv[1], "r");
            if (!file) {
                printf("Can't open file %s\n", argv[1]);
                return 1;
            }
        } else {
            printf("Incorrect argument count\n");
            return 1;
        }

        char buffer[LINE_BUF_SIZE];
        int numLines = 0;
        while (!feof(file)) {
            // Print a progress dot every REPORTING_INTERVAL lines
            numLines = (numLines+1)%REPORTING_INTERVAL;
            if (numLines == 0) {
                fprintf(stderr, ".");
                fflush(stderr);
            }
            if (!fgets(buffer, LINE_BUF_SIZE-1, file)) {
                break;
            }

            // Find start of quoted method/URL string
            char* method = strchr(buffer, '"');
            if (!method) {
                continue;
            }
            method++;

            // Find end of method, and start of URL
            char* url = strchr(method, ' ');
            if (!url) {
                continue;
            }
            *url = '\0';
            url++;

            // Find end of URL
            char* referrer = strchr(url, ' ');
            if (!referrer) {
                continue;
            }
            *referrer = '\0';
            referrer++;

            // If URL does not contain "/wiki/", skip
            if (strstr(url, "/wiki/") == NULL) {
                continue;
            }

            // Find start of referrer
            referrer = strstr(referrer, " \"");
            if (!referrer) {
                continue;
            }
            referrer += 2;

            // Find end of referrer
            char* end = strchr(referrer, '"');
            if (!end) {
                continue;
            }
            *end = '\0';

            // Obtain indexes
            int from = getUrlIndex(referrer);
            int to = getUrlIndex(url);

            // Add to matrix
            if ((int)outbound.size() < from+1) {
                outbound.resize(from+1);
            }
            outbound[from][to]++;
        }

        // Output URLs
        int numUrls = urls.size();
        for (int i=0; i<numUrls; i++) {
            printf("%d\t%s\n", i, urls[i]);
            delete[] urls[i];
        }
        printf("\n");

        // Output matrix as (from, to, count) triples
        for (int i=0; i<(int)outbound.size(); i++) {
            map<int,int> & row = outbound[i];
            for (vectormap_inner_iterator j=row.begin(); j!=row.end(); j++) {
                printf("%d\t%d\t%d\n", i, j->first, j->second);
            }
        }
        return 0;
    }
    // Return the index for URL s, adding it to the URL table if it is new
    int getUrlIndex(char* s)
    {
        int index;
        hash_iterator iter = urlHash.find(s);
        if (iter != urlHash.end()) {
            index = iter->second;
        } else {
            // Copy string to the heap
            int length = strlen(s)+1;
            char* newMem = new char[length];
            memcpy(newMem, s, length);
            // Add to the containers
            urls.push_back(newMem);
            index = urls.size() - 1;
            urlHash[newMem] = index;
        }
        return index;
    }
    </pre>
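    If it helps settle the output format question: as written, the program prints a URL table (index, tab, URL), then a blank line, then the sparse transit matrix as from/to/count triples. The following is a rough sketch of a reader for that format, not part of the program above, just to show how the two sections fit together; it assumes the output stays exactly as in the code.
    <pre>
    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <vector>
    using namespace std;

    // Read the analysis output from stdin and print human-readable transit lines.
    int main()
    {
        char buffer[1000];
        vector<string> urls;

        // First section: "index<TAB>url" lines, terminated by a blank line.
        // Indexes are assumed to be sequential from 0, as the generator guarantees.
        while (fgets(buffer, sizeof(buffer), stdin) && buffer[0] != '\n') {
            char* tab = strchr(buffer, '\t');
            if (!tab) {
                continue;
            }
            char* nl = strchr(tab + 1, '\n');
            if (nl) {
                *nl = '\0';
            }
            urls.push_back(tab + 1);
        }

        // Second section: "from<TAB>to<TAB>count" triples
        int from, to, count;
        while (scanf("%d\t%d\t%d", &from, &to, &count) == 3) {
            if (from >= 0 && from < (int)urls.size() && to >= 0 && to < (int)urls.size()) {
                printf("%s -> %s : %d\n", urls[from].c_str(), urls[to].c_str(), count);
            }
        }
        return 0;
    }
    </pre>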