#!/usr/bin/perl -w
#
# mindshare, Jon Udell, udell@monad.net, http://udell.roninhouse.com/
#
# This script unrolls a Yahoo category to create a list of sites,
# then asks AltaVista how many pages point to each site in the list.
# In effect, it measures the Web mindshare of the sites in this category.
#
# If you use this script, please do so judiciously,
# with respect for Yahoo and AltaVista -- two of the Net's
# most valuable resources.
#
# The #! line must stay the very first line of the file: that is the only
# place it is looked for, so anywhere else the -w switch is silently lost.
#
use LWP::Simple;

# Base URL of the directory server; traverse() prepends it to every path.
my $host = "http://dir.yahoo.com";

# Category path to unroll. Keep exactly one $root active; the commented
# lines are alternative starting points.
#my $root = "/Computers_and_Internet/Software/Operating_Systems/Windows/";
#my $root = "/Science/Nanotechnology/";
my $root = "/Computers_and_Internet/News_and_Media/Magazines/";
#my $root = "/Business_and_Economy/Companies/Computers/Business_to_Business/Software/Internet/World_Wide_Web/";

# Regex fragments that traverse() alternates between to locate category
# ("node") links and site ("leaf") links in a fetched page.
# NOTE(review): both literals look damaged -- presumably HTML markup was
# stripped out of this listing at some point (note the raw line break inside
# $node_pat and the dangling "]+>"). They are kept byte-for-byte as found;
# restore them from a pristine copy of the script before relying on them.
my $node_pat = "
]+>";
my $leaf_pat = "]+>";

# Addresses traverse() has already encountered => number of times seen.
my %seen = ();
# Site address (without the leading "http://") => text captured after its link.
my %sites = ();
# Site => hit count returned by mindshare().
my %shares = ();

# Character class for one character of a domain-name label. The hyphen is
# last in the class, so it is literal and needs no backslash.
my $domchars = "[a-zA-Z0-9-]";
# Build a hashtable of sites and titles by walking the category tree.
traverse($root);

# Build a hashtable of mindshare numbers for each site.
foreach my $key (sort keys %sites)
{
    # Work on a copy so the %sites key itself stays usable for the title
    # lookup below. The substitution drops everything up to and including
    # the first '*' (presumably a redirect prefix -- confirm against real data).
    (my $site = $key) =~ s/^[^*]+\*//g;

    # Not perfect: yields "name.tld", which is right for .com but not for
    # .co.uk, .edu.au, etc.
    # Take the capture from the match's return list rather than from $1:
    # after a failed match $1 still holds the value of some earlier match.
    my ($dom) = $site =~ m#($domchars+\.$domchars+)(/|$)#;

    # No dotted host name found: fall back to whatever precedes the first '/'.
    ($dom) = $site =~ m#^([^/]*)# unless defined $dom;

    # Keyed by the original %sites key so that $sites{...} and $shares{...}
    # can always be looked up with the same string.
    $shares{$key} = mindshare($site, $dom, $sites{$key});
}
# Print results ordered by mindshare, highest count first (see bynum).
# Each row is: title | hit count | rank.
# NOTE(review): the wrapper and row strings look like what is left of HTML
# table markup after tags were stripped from this listing. They are kept
# byte-for-byte; restore the markup from a pristine copy if HTML output is
# wanted.
print "\n";
my $ord = 0;
foreach my $site (sort bynum keys %shares)
{
    $ord++;
    # Printed directly instead of through sprintf: the row contains scraped
    # titles, and using it as a format string would mangle any '%' in them.
    print "| $sites{$site} | $shares{$site} | $ord |
\n";
}
print "
\n";
# traverse($root)
#
# Fetches the page at "$host$root" and scans it for links matching
# $node_pat or $leaf_pat. Addresses starting with "http" are recorded in
# %sites (address minus "http://" => text following the link); every other
# address is treated as another directory page and traversed recursively.
# Addresses containing "yahoo.com" are skipped, and %seen prevents an
# address from being processed twice.
#
# Parameters: $root -- path of the page to fetch, appended to $host.
# Returns:    nothing useful; results accumulate in %sites and %seen.
# Progress and diagnostics go to STDERR.
sub traverse
{
    my ($root) = @_;

    my $raw = get("$host$root");
    unless ( defined $raw )
    {
        # LWP::Simple::get returns undef when the fetch fails; there is
        # nothing to scan in that case.
        print STDERR "fetch failed: $host$root\n";
        return;
    }

    while ( $raw =~ m#($node_pat|$leaf_pat)(.+)#g )
    {
        my $leaf_or_node = $1;
        my $title        = $2;   # rest of the line after the matched link

        # The address is the first double-quoted string in the matched text.
        # Skip the link if there is none: after a failed match $1 would
        # still hold the whole text captured by the loop condition.
        next unless $leaf_or_node =~ m#\"([^\"]+)\"#;
        my $leaf_or_node_addr = $1;

        # NOTE(review): this pattern is empty, so Perl reuses the last
        # successfully matched regex (the quoted-string match just above).
        # The address cannot contain a double quote, so this is a no-op.
        # Presumably a tag-stripping pattern was lost from the listing --
        # confirm against a pristine copy.
        $leaf_or_node_addr =~ s###g;

        # Drop everything up to and including the first '*'.
        $leaf_or_node_addr =~ s/^[^*]+\*//;
        print STDERR "$leaf_or_node_addr\n";

        # Ignore links that point back into Yahoo itself.
        next if ( $leaf_or_node_addr =~ m#yahoo\.com# );

        # NOTE(review): %seen is keyed by the address as written in the page,
        # so an identical relative address under two different parents is
        # only followed once -- verify that this is intended.
        if ( defined $seen{$leaf_or_node_addr} )
        {
            print STDERR "seen: $leaf_or_node_addr, $seen{$leaf_or_node_addr}\n";
            $seen{$leaf_or_node_addr}++;
            next;
        }
        $seen{$leaf_or_node_addr} = 1;

        if ( $leaf_or_node_addr =~ m#^http# )
        {
            # Leaf: an external site. Record it without the scheme prefix.
            # print STDERR "\"$leaf_or_node_addr\" => \"$title\"\n";
            my $site = $leaf_or_node_addr;
            $site =~ s#http://##;
            $sites{$site} = $title;
        }
        elsif ( substr($leaf_or_node_addr,0,1) eq '/' )
        {
            # Node with a server-absolute path.
            traverse($leaf_or_node_addr);
        }
        else
        {
            # Node with a path relative to the current page.
            traverse("$root$leaf_or_node_addr");
        }
    }
    return;
}
# mindshare($site, $dom, $title)
#
# Asks AltaVista how many pages link to $site, excluding pages whose own
# URL contains $dom, and returns that number.
#
# Parameters: $site  -- site address to look up (no "http://" prefix)
#             $dom   -- domain to exclude from the results
#             $title -- site title, used only in the STDERR trace line
# Returns:    the hit count as a plain number with separators removed,
#             or 0 when the fetch fails or no "About N pages" text is found.
sub mindshare
{
    my ($site,$dom,$title) = @_;

    # Alternative query that also counts links from the site's own domain:
    #   "http://www.altavista.com/cgi-bin/query?pg=q&kl=XX&q=link%3A$site"
    my $result = get("http://www.altavista.com/cgi-bin/query?pg=q&kl=XX&q=link%3A$site+-url%3A$dom");

    my $count = 0;
    # get() returns undef on failure, so check before matching.
    if ( defined $result && $result =~ m#About ([,\d]+) pages# )
    {
        $count = $1;
        # Delete every thousands separator, not just the first one:
        # "1,234,567" must become 1234567 so that numeric sorting works.
        $count =~ tr/,//d;
    }
    print STDERR "$dom\t$site\t$title\t$count\n";
    return $count;
}
# Sort comparator: orders site keys by their %shares value, numerically,
# largest first. Intended for use as "sort bynum LIST".
sub bynum
{
    my $left  = $shares{$a};
    my $right = $shares{$b};
    return $right <=> $left;
}