shithub: purgatorio

ref: f5cc6fbe3a7bcf8bdb002c646ddd519014afafd2
dir: /appl/cmd/webgrab.b/

View raw version
# Webgrab -- for getting html pages and the subordinate files (images, frame children)
# they refer to (using "src=..." in a tag) into the local file space.
# Assume http: scheme if none specified.
# Usage:
#	webgrab [-r] [-v] [-o stem] url
#  If stem is specified, file will be saved in stem.html and images will
#  go in stem_1.jpg (or .gif, ...), stem_2.jpg, etc.
#  If stem is not specified, derive it from url (see getstem comment, below).
# If -r is specified, get "raw", i.e., no image fetching/html munging.
# If -v is specified (verbose), print some progress information,
# with more if -vv is given.

implement Webgrab;

include "sys.m";
	sys: Sys;
	FD: import sys;

include "draw.m";

include "string.m";
	S: String;

include "url.m";
	U: Url;
	ParsedUrl: import U;

include "daytime.m";
	DT: Daytime;

include "bufio.m";
	B: Bufio;

include "dial.m";
	D: Dial;

include "arg.m";

Webgrab: module
{
	init: fn(ctxt: ref Draw->Context, args: list of string);
};

stderr: ref FD;
verbose := 0;
postbody : string;

httpproxy: ref Url->ParsedUrl;
noproxydoms: list of string;	# domains that don't require proxy

init(nil: ref Draw->Context, args: list of string)
{
	sys = load Sys Sys->PATH;
	stderr = sys->fildes(2);
	S = load String String->PATH;
	U = load Url Url->PATH;
	DT = load Daytime Daytime->PATH;
	D = load Dial Dial->PATH;
	B = load Bufio Bufio->PATH;
	arg := load Arg Arg->PATH;
	if(S == nil || U == nil || DT == nil || B == nil || arg == nil)
		error_exit("can't load a module");
	U->init();
	stem := "";
	rawflag := 0;
	arg->init(args);
	arg->setusage("webgrab [-r] [-v[v]] [-p postbody] [-o stem] url");
	url := "";
	while((o := arg->opt()) != 0)
		case o {
		'r' =>
			rawflag = 1;
		'v' =>
			verbose++;
		'o' =>
			stem = arg->earg();
		'p' =>
			postbody = arg->earg();
		* =>
			arg->usage();
		}
	args = arg->argv();
	if(len args != 1)
		arg->usage();
	url = hd args;
	arg = nil;
	(nil,xr) := S->splitstrl(url,"//");
	(nil,yr) := S->splitl(url,":");
	if(xr == "" && yr == "")
		url = "http://" + url;
	u := U->makeurl(url);
	if(stem == "")
		stem = getstem(u);
	readconfig();
	grab(u, stem, rawflag);
}

readconfig()
{
	cfgio := B->open("/services/webget/config", sys->OREAD);
	if(cfgio != nil) {
		for(;;) {
			line := B->cfgio.gets('\n');
			if(line == "") {
				B->cfgio.close();
				break;
			}
			if(line[0]=='#')
				continue;
			(key, val) := S->splitl(line, " \t=");
			val = S->take(S->drop(val, " \t="), "^\r\n");
			if(val == "")
				continue;
			case key {
			"httpproxy" =>
				if(val == "none")
					continue;
				# val should be host or host:port
				httpproxy = U->makeurl("http://" + val);
				if(verbose)
					sys->fprint(stderr, "Using http proxy %s\n", httpproxy.tostring());
			"noproxy" or
			"noproxydoms" =>
				(nil, noproxydoms) = sys->tokenize(val, ";, \t");
			}
		}
	}
}

# Make up a stem for forming save-file-names, based on url u.
# Use the last non-nil component of u.path, without a final extension,
# else use the host.  Then, if the stem still contains a '.' (e.g., www.lucent)
# use the part after the final '.'.
# Finally, if all else fails, use use "grabout".
getstem(u: ref ParsedUrl) : string
{
	stem := "";
	if(u.path != "") {
		(l, r) := S->splitr(u.path, "/");
		if(r == "") {
			# path ended with '/'; try next to last component
			if(l != "")
				(l, r) = S->splitr(l[0:len l - 1], "/");
		}
		if(r != "")
			stem = r;
	}
	if(stem == "")
		stem = u.host;
	if(stem != "") {
		ext: string;
		(stem, ext) = S->splitr(stem, ".");
		if(stem == "")
			stem = ext;
		else
			stem = stem[0:len stem - 1];
		(nil, stem) = S->splitr(stem, ".");
	}
	if(stem == "")
		stem = "grabout";
	return stem;
}

grab(u: ref ParsedUrl, stem: string, rawflag: int)
{
	(err, contents, fd, actual) := httpget(u);
	if(err != "")
		error_exit(err);
	ish := is_html(contents);
	if(ish)
		contents = addfetchcomment(contents, u, actual);
	if(rawflag || !ish) {
		writebytes(stem, contents, fd);
		return;
	}
	# get subordinates, modify contents
	subs : list of (string, string);
	(contents, subs)  = subfix(contents, stem);
	writebytes(stem + ".html", contents, fd);
	for(l := subs; l != nil; l = tl l) {
		(fname, suburl) := hd l;
		subu := U->makeurl(suburl);
		subu.makeabsolute(actual);
		(suberr, subcontents, subfd, nil) := httpget(subu);
		if(suberr != "") {
			sys->fprint(stderr, "webgrab: can't fetch subordinate %s from %s: %s\n", fname, subu.tostring(), suberr);
			continue;
		}
		writebytes(fname, subcontents, subfd);
	}
}

# Fix the html in array a so that referenced subordinate files (SRC= or BACKGROUND= fields of tags)
# are replaced with local names (stem_1.xxx, stem_2.xxx, etc.),
# and return the fixed array along with a list of (local name, subordinate url)
# of images to be fetched.
subfix(a: array of byte, stem: string) : (array of byte, list of (string, string))
{
	alen := len a;
	if(alen == 0)
		return (a, nil);
	nsubs := 0;
	newa := array[alen + 1000] of byte;
	newai := 0;
	j := 0;
	intag := 0;
	incom := 0;
	quote := 0;
	subs : list of (string, string) = nil;
	for(i := 0; i < alen; i++) {
		c := int a[i];
		if(incom) {
			if(amatch(a, i, alen, "-->")) {
				incom = 0;
				i = i+2;
			}
		}
		else if(intag) {
			if(quote==0 && (amatch(a, i, alen, "src") || amatch(a, i, alen, "background"))) {
				v := "";
				eqi := 0;
				if(amatch(a, i, alen, "src"))
					k := i+3;
				else
					k = i+10;
				for(; k < alen; k++)
					if(!iswhite(int a[k]))
						break;
				if(k < alen && int a[k] == '=') {
					eqi = k;
					k++;
					while(k<alen && iswhite(int a[k]))
						k++;
					if(k<alen) {
						kstart := k;
						c = int a[k];
						if(c == '\'' || c== '"') {
							quote = int a[k++];
							while(k<alen && (int a[k])!=quote)
								k++;
							v = string a[kstart+1:k];
							k++;
						}
						else {
							while(k<alen && !iswhite(int a[k]) && int a[k] != '>')
								k++;
							v = string a[kstart:k];
						}
					}
				}
				if(v != "") {
					f := "";
					for(l := subs; l != nil; l = tl l) {
						(ff,uu) := hd l;
						if(v == uu) {
							f = ff;
							break;
						}
					}
					if(f == "") {
						nsubs++;
						f = stem + "_" + string nsubs + getsuff(v);
						subs = (f, v) :: subs;
					}
					# should check for newa too small
					newa[newai:] = a[j:eqi+1];
					newai += eqi+1-j;
					xa := array of byte f;
					newa[newai:] = xa;
					newai += len xa;
					j = k;
				}
				i = k-1;
			}
			if(c == '>' && quote == 0)
				intag = 0;
			if(quote) {
				if(quote == c)
					quote = 0;
			else if(c == '"' || c == '\'')
				quote = c;
			}
		}
		else if(c == '<')
			intag = 1;
	}
	if(nsubs == 0)
		return (a, nil);
	if(i > j) {
		newa[newai:] = a[j:i];
		newai += i-j;
	}
	ans := array[newai] of byte;
	ans[0:] = newa[0:newai];
	anssubs : list of (string, string) = nil;
	for(ll := subs; ll != nil; ll = tl ll)
		anssubs = hd ll :: anssubs;
	return (ans, anssubs);
}

# add c after all f's in a
fixnames(a: array of byte, f: string, c: byte)
{
	alen := len a;
	n := alen - len f;
	for(i := 0; i < n; i++) {
		if(amatch(a, i, alen, f)) {
			a[i+len f] = c;
		}
	}
}

amatch(a: array of byte, i, alen: int, s: string) : int
{
	slen := len s;
	for(k := 0; i+k < alen && k < slen; k++) {
		c := int a[i+k];
		if(c >= 'A' && c <= 'Z')
			c = c + (int 'a' - int 'A');
		if(c != s[k])
			break;
	}
	if(k == slen) {
		return 1;
	}
	return 0;
}

getsuff(ustr: string) : string
{
	u := U->makeurl(ustr);
	if(u.path != "") {
		for(i := len u.path - 1; i >= 0; i--) {
			c := u.path[i];
			if(c == '.')
				return u.path[i:];
			if(c == '/')
				break;
		}
	}
	return "";
}

iswhite(c: int) : int
{
	return (c==' ' || c=='\t' || c=='\n' || c=='\r');
}

# Add a comment to end of a giving date and source of fetch
addfetchcomment(a: array of byte, u, actu: ref ParsedUrl) : array of byte
{
	now := DT->text(DT->local(DT->now()));
	ustr := u.tostring();
	actustr := actu.tostring();
	comment := "\n<!-- Fetched " + now + " from " + ustr;
	if(ustr != actustr)
		comment += ", redirected to " + actustr;
	comment += " -->\n";
	acom := array of byte comment;
	newa := array[len a + len acom] of byte;
	newa[0:] = a;
	newa[len a:] = acom;
	return newa;
}

# Get u, return (error string, body, actual url of source, after redirection)
httpget(u: ref ParsedUrl) : (string, array of byte, ref Sys->FD, ref ParsedUrl)
{
	ans, body : array of byte;
	restfd: ref Sys->FD;
	req : string;
	
	for(redir := 0; redir < 10; redir++) {
		if(u.port == "")
			u.port = "80";	# default IP port for HTTP
		if(verbose)
			sys->fprint(stderr, "connecting to %s\n", u.host);
		dialhost, port: string;

		if(httpproxy != nil && need_proxy(u.host)) {
			dialhost = httpproxy.host;
			port = httpproxy.port;
		}
		else {
			dialhost = u.host;
			port = u.port;
		}
		dest := D->netmkaddr(dialhost, "tcp", port);
		net := D->dial(dest, nil);
		if(net == nil)
			return (sys->sprint("can't dial %s: %r", dest), nil, nil, nil);
			
		# prepare request
		if(u.query != ""){
			u.query = "?" + u.query;
		}

		if (postbody == nil){
			if(httpproxy == nil || !need_proxy(u.host)){
				req = sys->sprint("GET /%s%s HTTP/1.0\r\n"+
						"Host: %s\r\n"+
						"User-agent: Inferno/webgrab\r\n"+
						"Cache-Control: no-cache\r\n"+
						"Pragma: no-cache\r\n\r\n",
						u.path, u.query, u.host);
			}else{
				req = sys->sprint("GET http:///%s%s HTTP/1.0\r\n"+
						"Host: %s\r\n"+
						"User-agent: Inferno/webgrab\r\n"+
						"Cache-Control: no-cache\r\n"+
						"Pragma: no-cache\r\n\r\n",
						u.host, u.path, u.host);
			}
		}else{
				req = sys->sprint("POST /%s HTTP/1.0\r\n"+
						"Host: %s\r\n"+
						"Content-type: application/x-www-form-urlencoded\r\n"+
						"Content-length: %d\r\n"+
						"User-agent: Inferno/webgrab\r\n"+
						"\r\n"+"%s",
						u.path, u.host, len postbody, postbody);

		}

		if(verbose)
			sys->fprint(stderr, "writing request: %s\n", req);
		areq := array of byte req;
		n := sys->write(net.dfd, areq, len areq);
		if(n != len areq)
			return (sys->sprint("write problem: %r"), nil, nil, nil);
		(ans, restfd) = readbytes(net.dfd);
		(status, rest) := stripline(ans);
		if(verbose)
			sys->fprint(stderr, "response: %s\n", status);
		(vers, statusrest) := S->splitl(status, " ");
		if(!S->prefix("HTTP/", vers))
			return ("bad reply status: " + status, rest, restfd, nil);
		code := int statusrest;
		location := "";
		body = rest;
		for(;;) {
			hline: string;
			(hline, body) = stripline(body);
			if(hline == "")
				break;
			if(verbose > 1)
				sys->fprint(stderr, "%s\n", hline);
			if(!iswhite(hline[0])) {
				(hname, hrest) := S->splitl(hline, ":");
				if(hrest != "") {
					hname = S->tolower(hname);
					hval := S->drop(hrest, ": \t");
					hval = S->take(hval, "^ \t");
					if(hname == "location")
						location = hval;
				}
			}
		}
		if(code != 200) {
			if((code == 300 || code == 301 || code == 302) && location != "") {
				# MultipleChoices, MovedPerm, or MovedTemp
				if(verbose)
					sys->fprint(stderr, "redirect to %s\n", location);
				u = U->makeurl(location);
				continue; 
			}
			return ("status not ok: " + status, rest, restfd, u);
		}
		break;
	}
	return ("", body, restfd, u);
}


need_proxy(h: string) : int
{
	doml := noproxydoms;
	if(doml == nil)
		return 1;		# all domains need proxy

	lh := len h;
	for(dom := hd doml; doml != nil; doml = tl doml) {
		ld := len dom;
		if(lh >= ld && h[lh-ld:] == dom)
			return 0;	# domain is on the noproxy list
	}

	return 1;
}

# Simple guess test for HTML: first non-white byte is '<'
is_html(a: array of byte) : int
{
	for(i := 0; i < len a; i++)
		if(!iswhite(int a[i]))
			break;
	if(i < len a && a[i] == byte '<')
		return 1;
	return 0;
}

readbytes(fd: ref Sys->FD) : (array of byte, ref Sys->FD)
{
	buf := array[Sys->ATOMICIO] of byte;
	i := 0;
	avail := len buf;
	while (avail > 0) {
		n := sys->read(fd, buf[i:], avail);
		if(n <= 0) {
			fd = nil;
			break;
		}
		i += n;
		avail -= n;
	}
	return (buf[0:i], fd);
}

writebytes(f: string, a: array of byte, fd: ref Sys->FD)
{
	ofd: ref Sys->FD;
	if (f == "-")
		ofd = sys->fildes(1);
	else
		ofd = sys->create(f, Sys->OWRITE, 8r666);
	if(ofd == nil) {
		sys->fprint(stderr, "webgrab: can't create %s: %r\n", f);
		return;
	}
	i := 0;
	clen := len a;
	while(i < clen) {
		n := sys->write(ofd, a[i:], clen-i);
		if(n < 0) {
			sys->fprint(stderr, "webgrab: write error: %r\n");
			return;
		}
		i += n;
	}
	if(fd != nil) {
		buf := array[Sys->ATOMICIO] of byte;
		while((n := sys->read(fd, buf, len buf)) > 0) {
			if(sys->write(ofd, buf, n) != n) {
				sys->fprint(stderr, "webgrab: write error: %r\n");
				return;
			}
		}
		if(n < 0) {
			sys->fprint(stderr, "webgrab: read error: %r\n");
			return;
		}
		clen += n;
	}
	if (f != "-")
		sys->fprint(stderr, "created %s, %d bytes\n", f, clen);
}

stripline(b: array of byte) : (string, array of byte)
{
	n := len b - 1;
	for(i := 0; i < n; i++)
		if(b[i] == byte '\r' && b[i+1] == byte '\n')
			return (string b[0:i], b[i+2:]);
	return ("", b);
}

error_exit(msg: string)
{
	sys->fprint(sys->fildes(2), "%s\n", msg);
	raise "fail:error";
}