User:DCausse/Completion Suggester And Pageviews
Appearance
Dump that contains all the variables needed to compute the score on enwiki: stats_enwiki_with_pv.csv.gz
Small R script to experiment with the score
# Load the per-page statistics used by the scoring functions below.
# The path has no .gz suffix, so the decompressed CSV is expected in the
# working directory. read.csv() already defaults to header = TRUE, sep = ",".
dat <- read.csv(file = "stats_enwiki_with_pv.csv")
# Popularity score on a log scale, saturating at 1.
#
# Args:
#   maxDocs:  number of documents; scales both the value and the log base.
#             Expected to be a scalar (or the same length as popScore).
#   popScore: page popularity (presumably page views / total project page
#             views, going by the original author's note -- TODO confirm).
#             May be a vector; NA entries yield NA instead of aborting.
#
# Returns:
#   1 wherever popScore exceeds POPULARITY_MAX, otherwise
#   log(1 + popScore * maxDocs) in base (1 + POPULARITY_MAX * maxDocs).
#   The result has the shape of popScore.
pop_score <- function(maxDocs, popScore) {
  POPULARITY_MAX <- 0.0004

  # Original author's note: this log scale depends on maxDocs and is
  # considered broken; working directly on the ratio pv / total_project_pv
  # turned out to be difficult.
  #
  # ifelse() keeps the scalar behaviour of the former if/return (the log is
  # not even evaluated for a saturated scalar) while accepting whole columns.
  ifelse(
    popScore > POPULARITY_MAX,
    1,
    logb(1 + (popScore * maxDocs), base = 1 + (POPULARITY_MAX * maxDocs))
  )
}
# Score function
# Blends the quality score and the popularity score as a weighted mean
# (quality weight 1, popularity weight 0.4) and scales it by 10,000,000.
#
# Args:
#   maxDocs .. tmplBoost: forwarded unchanged to qual_score().
#   maxDocs, popScore:    forwarded to pop_score().
#
# Returns:
#   (qual * 1 + pop * 0.4) / 1.4, multiplied by 10000000.
popqual_score <- function(maxDocs, incomingLinksRaw, externalLinksRaw,
                          bytesRaw, headingsRaw, redirectsRaw, tmplBoost,
                          popScore) {
  quality_weight <- 1
  popularity_weight <- 0.4
  score_range <- 10000000

  quality <- qual_score(
    maxDocs = maxDocs,
    incomingLinksRaw = incomingLinksRaw,
    externalLinksRaw = externalLinksRaw,
    bytesRaw = bytesRaw,
    headingsRaw = headingsRaw,
    redirectsRaw = redirectsRaw,
    tmplBoost = tmplBoost
  )
  popularity <- pop_score(maxDocs = maxDocs, popScore = popScore)

  weighted_sum <- quality * quality_weight + popularity * popularity_weight
  weight_total <- quality_weight + popularity_weight

  (weighted_sum / weight_total) * score_range
}
# Quality score: weighted mean of normalised page features, then adjusted by
# the template boost.
#
# Args:
#   maxDocs:          number of documents; incoming links saturate at
#                     maxDocs / 10.
#   incomingLinksRaw: incoming link count, log2-normalised via scoreNormL2().
#   externalLinksRaw: external link count, linear via scoreNorm(), cap 20.
#   bytesRaw:         page size, log2-normalised via scoreNormL2(), cap 50000.
#   headingsRaw:      heading count, linear via scoreNorm(), cap 20.
#   redirectsRaw:     redirect count, linear via scoreNorm(), cap 30.
#   tmplBoost:        boost factor handed to boost().
#
# Returns:
#   boost(weighted mean of the five normalised features, tmplBoost).
qual_score <- function(maxDocs, incomingLinksRaw, externalLinksRaw, bytesRaw,
                       headingsRaw, redirectsRaw, tmplBoost) {
  # Saturation points of the individual features.
  INCOMING_LINKS_MAX_DOCS_FACTOR <- 1 / 10
  EXTERNAL_LINKS_NORM <- 20
  PAGE_SIZE_NORM <- 50000
  HEADING_NORM <- 20
  REDIRECT_NORM <- 30

  # Relative weights; they sum to 1.1, hence the division by their sum below.
  INCOMING_LINKS_WEIGHT <- 0.6
  EXTERNAL_LINKS_WEIGHT <- 0.1
  PAGE_SIZE_WEIGHT <- 0.1
  HEADING_WEIGHT <- 0.2
  REDIRECT_WEIGHT <- 0.1

  # Constants of a disabled "balance" experiment (see the end of the function).
  # OUTGOING_LINK_BALANCE <- 2000
  # HEADINGS_BALANCE <- 3000
  # EXT_LINKS_BALANCE <- 2000

  # Incoming links are capped once a page is linked by more than 1/10 of
  # all pages.
  incLinksNorm <- maxDocs * INCOMING_LINKS_MAX_DOCS_FACTOR

  incLinks <- scoreNormL2(incomingLinksRaw, incLinksNorm)
  pageSize <- scoreNormL2(bytesRaw, PAGE_SIZE_NORM)
  extLinks <- scoreNorm(externalLinksRaw, EXTERNAL_LINKS_NORM)
  headings <- scoreNorm(headingsRaw, HEADING_NORM)
  redirects <- scoreNorm(redirectsRaw, REDIRECT_NORM)

  # Terms are accumulated in the same order as before so the floating-point
  # result is unchanged.
  score <- incLinks * INCOMING_LINKS_WEIGHT
  score <- score + extLinks * EXTERNAL_LINKS_WEIGHT
  score <- score + pageSize * PAGE_SIZE_WEIGHT
  score <- score + headings * HEADING_WEIGHT
  score <- score + redirects * REDIRECT_WEIGHT
  score <- score / (INCOMING_LINKS_WEIGHT + EXTERNAL_LINKS_WEIGHT +
    PAGE_SIZE_WEIGHT + HEADING_WEIGHT + REDIRECT_WEIGHT)

  # Disabled experiment: penalise pages whose headings / external links are
  # out of balance with their size. The original draft computed
  # extLinksDistance from headingsBalance, which looks like a copy/paste slip;
  # it is corrected here in case the experiment is revived.
  # headingsBalance <- bytesRaw / headingsRaw
  # headingDistance <- HEADINGS_BALANCE - headingsBalance
  # extLinksBalance <- externalLinksRaw / bytesRaw
  # extLinksDistance <- EXT_LINKS_BALANCE - extLinksBalance
  # score <- score * (1 - 1 / abs(headingDistance))
  # score <- score * (1 - 1 / abs(extLinksDistance))

  boost(score, tmplBoost)
}
# log2(value/norm + 1), with value capped at norm.
#
# Args:
#   value: raw measurement(s); may be a vector. NA entries yield NA.
#   norm:  saturation point; values above it are treated as equal to it.
#
# Returns:
#   log2(min(value, norm) / norm + 1): 0 for value 0 and 1 once value
#   reaches norm.
scoreNormL2 <- function(value, norm) {
  # pmin() caps element-wise, so whole columns can be scored in one call and
  # missing values propagate instead of aborting an `if`.
  capped <- pmin(value, norm)
  log2((capped / norm) + 1)
}
# simple ratio value/norm, with value capped at norm.
#
# Args:
#   value: raw measurement(s); may be a vector. NA entries yield NA.
#   norm:  saturation point; values above it are treated as equal to it.
#
# Returns:
#   min(value, norm) / norm, i.e. at most 1.
scoreNorm <- function(value, norm) {
  # pmin() caps element-wise, so whole columns can be scored in one call and
  # missing values propagate instead of aborting an `if`.
  pmin(value, norm) / norm
}
# Nudges a score up or down according to a boost factor.
#
# Args:
#   score: the score to adjust (scalar).
#   boost: boost factor (scalar). A factor above 1 raises the score, any
#          other factor leaves it unchanged (factor 1) or lowers it.
#
# Returns:
#   For boost > 1:  score + ((1 - score) / 2) * (1 - 1 / boost)
#   Otherwise:      score + (score / 2) * -(1 - boost)
boost <- function(score, boost) {
  # Translate the factor into a signed adjustment: positive for factors
  # above 1, zero or negative otherwise.
  adjustment <- if (boost > 1) {
    1 - (1 / boost)
  } else {
    -(1 - boost)
  }

  # A positive adjustment works on half of the remaining headroom (1 - score);
  # a non-positive one works on half of the score itself.
  half_span <- if (adjustment > 0) {
    (1 - score) / 2
  } else {
    score / 2
  }

  score + (half_span * adjustment)
}
# compute the score
# Every row is scored individually through mapply(); the row count of the
# dump serves as maxDocs and is recycled across all rows.
dat$score <- mapply(
  popqual_score,
  maxDocs = nrow(dat),
  incomingLinksRaw = dat$incomingLinks,
  externalLinksRaw = dat$externalLinks,
  bytesRaw = dat$bytes,
  headingsRaw = dat$headings,
  redirectsRaw = dat$redirects,
  tmplBoost = dat$tmplBoost,
  popScore = dat$pop_score
)
dat$score_qual <- mapply(
  qual_score,
  maxDocs = nrow(dat),
  incomingLinksRaw = dat$incomingLinks,
  externalLinksRaw = dat$externalLinks,
  bytesRaw = dat$bytes,
  headingsRaw = dat$headings,
  redirectsRaw = dat$redirects,
  tmplBoost = dat$tmplBoost
)
dat$score_pop <- mapply(
  pop_score,
  maxDocs = nrow(dat),
  popScore = dat$pop_score
)

# Hand-picked title prefixes for eyeballing the ranking.
li <- dat[grepl(pattern = "^Li", x = dat$page), ]
oba_osa <- dat[grepl(pattern = "^(Barack|Osama)", x = dat$page), ]
peg <- dat[grepl(pattern = "^Peg", x = dat$page), ]
Po <- dat[grepl(pattern = "^Po", x = dat$page), ]

# Sanity check: rows whose quality score exceeds 1.
bug <- dat[dat$score_qual > 1, ]
# various stuff to explore distribution
# Only the two summaries are active; everything else is kept commented out
# and can be enabled as needed.

# --- quantiles / summaries ---
# quantile(dat$redirects, 0.999)
summary(dat[["score_pop"]])
summary(dat[["score_qual"]])

# --- densities of the raw features (log-scaled x axis) ---
# plot(density(dat$incomingLinks, from = 0.00001), log = "x", xlim = c(1, max(dat$incomingLinks)))
# plot(density(dat$externalLinks, from = 0.00001), log = "x", xlim = c(1, max(dat$externalLinks)))
# plot(density(dat$bytes, from = 0.00001), log = "x", xlim = c(1, max(dat$bytes)))
# plot(density(dat$headings, from = 0.00001), log = "x", xlim = c(1, max(dat$headings)))
# plot(density(dat$redirects, from = 0.00001), log = "x", xlim = c(1, max(dat$redirects)))
# plot(density(dat$incomingLinks, from = 0.00001), log = "x", xlim = c(1, max(dat$incomingLinks)))

# --- score against each feature, on a 100000-row random sample ---
# mysamp <- dat[sample(1:nrow(dat), 100000, replace = FALSE), ]
# plot(mysamp$incomingLinks + 10, mysamp$score, log = "x", cex = 0.2)
# plot(mysamp$bytes + 10, mysamp$score, log = "x", cex = 0.2)
# plot(mysamp$headings, mysamp$score, cex = 0.2)
# plot(mysamp$redirects, mysamp$score, cex = 0.2)
# plot(mysamp$externalLinks + 10, mysamp$score, cex = 0.2)

# --- densities of the score and of heading-related columns ---
# plot(density(dat$score, from = 0.00001), log = "x", xlim = c(1, max(dat$score)))
# NOTE(review): no headingBalance column is created by this script -- confirm
# it exists in the dump before enabling the next line.
# plot(density(dat$headingBalance))
# plot(density(dat$headings))

# --- incoming links against the other features (log-log) ---
# plot(dat$incomingLinks + 10, dat$externalLinks + 10, log = "yx", cex = 0.2)
# plot(dat$incomingLinks + 10, dat$bytes + 10, log = "yx", cex = 0.2)
# plot(dat$incomingLinks + 10, dat$headings + 10, log = "yx", cex = 0.2)
# plot(dat$incomingLinks + 10, dat$headings + 10, log = "yx", cex = 0.2)
# plot(dat$incomingLinks + 10, dat$redirects + 10, log = "yx", cex = 0.2)

# --- component scores against the combined score ---
# plot(dat$score_pop, dat$score, cex = 0.2)
# plot(dat$score_qual, dat$score, cex = 0.2)