shithub: riscv

Download patch

ref: d2a7d886624c56673a6d7ba7d6a7958d2be5b867
parent: c14ea9fdd1521ff9322f9af71b801e016622c0cd
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sat Mar 12 15:53:17 EST 2022

devip: implement network address translation routes

This adds a new route "t"-flag that enables network address translation,
replacing the source address (and local port) of a forwarded packet with
one belonging to the outgoing interface.

The state for a translation is kept in a new Translation structure,
which contains two Iphash entries, so it can be inserted into the
per protocol 4-tuple hash table, requiring no extra lookups.

Translations have a low overhead (~200 bytes on amd64),
so we can have many of them. They get reused after 5 minutes
of inactivity or when the per protocol limit of 1000 entries
is reached (then the one with longest inactivity is reused).

The protocol needs to export a "forward" function that is responsible
for modifying the forwarded packet, and then handle translations in
its input function for iphash hits with Iphash.trans != 0.

This patch also fixes a few minor things found during development:

- Include the Iphash in the Conv structure, avoiding extra malloc
- Fix ttl exceeded check (ttl < 1 -> ttl <= 1)
- Router should not reply with ttl exceeded for multicast flows
- Extra checks for icmp advice to avoid protocol confusions.

--- a/sys/man/3/ip
+++ b/sys/man/3/ip
@@ -411,6 +411,9 @@
 .TP
 .B p
 point-to-point route
+.TP
+.B t
+network address translation on source
 .PD
 .PP
 The tag is an arbitrary, up to 4 character, string.  It is normally used to
@@ -442,7 +445,7 @@
 .TP
 .BI add\  "target mask nexthop tag interface source smask"
 .TP
-.BI add\  "target mask nexthop type tag interface source smask"
+.BI add\  "target mask nexthop flags tag interface source smask"
 Add the route to the table.  If one already exists with the
 same target and mask, replace it. The
 .I interface
@@ -461,7 +464,7 @@
 .TP
 .BI remove\  "target mask nexthop tag interface source smask"
 .TP
-.BI remove\  "target mask nexthop type tag interface source smask"
+.BI remove\  "target mask nexthop flags tag interface source smask"
 Remove the matching route.
 .
 .SS "Address resolution
--- a/sys/src/9/ip/devip.c
+++ b/sys/src/9/ip/devip.c
@@ -737,6 +737,7 @@
 char*
 setluniqueport(Conv* c, int lport)
 {
+	Translation *q;
 	Proto *p;
 	Conv *xp;
 	int x;
@@ -754,14 +755,22 @@
 		&& xp->lport == lport
 		&& xp->rport == c->rport
 		&& ipcmp(xp->raddr, c->raddr) == 0
-		&& ipcmp(xp->laddr, c->laddr) == 0){
-			qunlock(p);
-			return "address in use";
-		}
+		&& ipcmp(xp->laddr, c->laddr) == 0)
+			goto Inuse;
 	}
+	for(q = p->translations; q != nil; q = q->next){
+		if(q->backward.lport == lport
+		&& q->backward.rport == c->rport
+		&& ipcmp(q->backward.raddr, c->raddr) == 0
+		&& ipcmp(q->backward.laddr, c->laddr) == 0)
+			goto Inuse;
+	}
 	c->lport = lport;
 	qunlock(p);
 	return nil;
+Inuse:
+	qunlock(p);
+	return "address in use";
 }
 
 /*
@@ -770,18 +779,51 @@
 static int
 lportinuse(Proto *p, ushort lport)
 {
+	Translation *q;
 	int x;
 
 	for(x = 0; x < p->nc && p->conv[x]; x++)
 		if(p->conv[x]->lport == lport)
 			return 1;
+	for(q = p->translations; q != nil; q = q->next)
+		if(q->backward.lport == lport)
+			return 1;
 	return 0;
 }
 
+/* 
+ *  find an unused local port for a protocol.
+ *  returns the port, or -1 if all 64 random probes hit ports in use.
+ *  p needs to be locked
+ */
+int
+unusedlport(Proto *p)
+{
+	ushort port;
+	int i;
+
+	/*
+	 * Unrestricted ports are chosen randomly
+	 * between 2^15 and 2^16.  There are at most
+	 * 4*Nchan = 4096 ports in use at any given time,
+	 * so even in the worst case, a random probe has a
+	 * 1 - 4096/2^15 = 87% chance of success.
+	 * If 64 successive probes fail, there is a bug somewhere
+	 * (or a once in 10^58 event has happened, but that's
+	 * less likely than a venti collision).
+	 */
+	for(i=0; i<64; i++){
+		port = (1<<15) + nrand(1<<15);
+		if(!lportinuse(p, port))
+			return port;
+	}
+	return -1;
+}
+
 /*
  *  pick a local port and set it
  */
-char *
+static char *
 setlport(Conv* c)
 {
 	Proto *p;
@@ -799,21 +841,9 @@
 				goto chosen;
 		}
 	}else{
-		/*
-		 * Unrestricted ports are chosen randomly
-		 * between 2^15 and 2^16.  There are at most
-		 * 4*Nchan = 4096 ports in use at any given time,
-		 * so even in the worst case, a random probe has a
-		 * 1 - 4096/2^15 = 87% chance of success.
-		 * If 64 successive probes fail, there is a bug somewhere
-		 * (or a once in 10^58 event has happened, but that's
-		 * less likely than a venti collision).
-		 */
-		for(i=0; i<64; i++){
-			port = (1<<15) + nrand(1<<15);
-			if(!lportinuse(p, port))
-				goto chosen;
-		}
+		port = unusedlport(p);
+		if(port > 0)
+			goto chosen;
 	}
 	qunlock(p);
 	return "no ports available";
--- a/sys/src/9/ip/icmp.c
+++ b/sys/src/9/ip/icmp.c
@@ -99,6 +99,8 @@
 	/* message counts */
 	ulong	in[Maxtype+1];
 	ulong	out[Maxtype+1];
+
+	Ipht	ht;
 };
 
 static void icmpkick(void *x, Block*);
@@ -192,9 +194,9 @@
 	uchar addr[IPaddrlen];
 	int i;
 
-	v4tov6(addr, ip4);
-	if(ipismulticast(addr))
+	if(isv4mcast(ip4))
 		return 0;
+	v4tov6(addr, ip4);
 	i = ipforme(f, addr);
 	return i == 0 || i == Runi;
 }
@@ -204,9 +206,9 @@
 {
 	uchar addr[IPaddrlen];
 
-	v4tov6(addr, ip4);
-	if(ipismulticast(addr))
+	if(isv4mcast(ip4))
 		return 0;
+	v4tov6(addr, ip4);
 	return ipforme(f, addr) == Runi;
 }
 
@@ -218,7 +220,7 @@
 	uchar	ia[IPv4addrlen];
 
 	p = (Icmp *)bp->rp;
-	if(!ip4reply(f, p->src) || !ipv4local(ifc, ia, 0, p->src))
+	if(isv4mcast(p->dst) || !ip4reply(f, p->src) || !ipv4local(ifc, ia, 0, p->src))
 		return;
 
 	netlog(f, Logicmp, "sending icmpttlexceeded %V -> src %V dst %V\n",
@@ -249,7 +251,7 @@
 	uchar	ia[IPv4addrlen];
 
 	p = (Icmp *)bp->rp;
-	if(!ip4reply(f, p->src))
+	if(isv4mcast(p->dst) || !ip4reply(f, p->src))
 		return;
 
 	if(ifc == nil){
@@ -302,21 +304,43 @@
 static void
 goticmpkt(Proto *icmp, Block *bp)
 {
-	ushort	recid;
 	uchar	dst[IPaddrlen], src[IPaddrlen];
+	ushort	recid;
 	Conv	**c, *s;
+	Iphash	*iph;
 	Icmp	*p;
-
-	p = (Icmp *) bp->rp;
+	
+	p = (Icmp *)bp->rp;
 	v4tov6(dst, p->dst);
 	v4tov6(src, p->src);
 	recid = nhgets(p->icmpid);
 
+	qlock(icmp);
+	iph = iphtlook(&((Icmppriv*)icmp->priv)->ht, src, recid, dst, recid);
+	if(iph != nil){
+		Translation *q;
+		int hop = p->ttl;
+
+		if(hop <= 1 || (q = transbackward(icmp, iph)) == nil)
+			goto raise;
+		memmove(p->dst, q->forward.raddr+IPv4off, IPv4addrlen);
+		hnputs_csum(p->icmpid, q->forward.rport, p->cksum);
+
+		/* only use route-hint when from original destination */
+		if(memcmp(p->src, q->forward.laddr+IPv4off, IPv4addrlen) != 0)
+			q = nil;
+		qunlock(icmp);
+
+		ipoput4(icmp->f, bp, 1, hop - 1, p->tos, q);
+		return;
+	}
 	for(c = icmp->conv; (s = *c) != nil; c++){
 		if(s->lport == recid)
 		if(ipcmp(s->laddr, dst) == 0 || ipcmp(s->raddr, src) == 0)
 			qpass(s->rq, copyblock(bp, blocklen(bp)));
 	}
+raise:
+	qunlock(icmp);
 	freeblist(bp);
 }
 
@@ -404,6 +428,13 @@
 		ipriv->out[EchoReply]++;
 		ipoput4(icmp->f, r, 0, MAXTTL, DFLTTOS, nil);
 		break;
+	case TimeExceed:
+		if(p->code == 0){
+			snprint(msg = m2, sizeof m2, "ttl exceeded at %V", p->src);
+			goto Advise;
+		}
+		goticmpkt(icmp, bp);
+		break;
 	case Unreachable:
 		if(p->code >= nelem(unreachcode)) {
 			snprint(m2, sizeof m2, "unreachable %V -> %V code %d",
@@ -411,7 +442,6 @@
 			msg = m2;
 		} else
 			msg = unreachcode[p->code];
-
 	Advise:
 		bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE;
 		if(BLEN(bp) < MinAdvise){
@@ -419,40 +449,80 @@
 			goto raise;
 		}
 		p = (Icmp *)bp->rp;
-		if((nhgets(p->frag) & IP_FO) == 0){
+		if(p->vihl == (IP_VER4|IP_HLEN4)	/* advise() does not expect options */
+		&& (nhgets(p->frag) & IP_FO) == 0	/* first fragment */
+		&& ipcsum(&p->vihl) == 0){
 			pr = Fsrcvpcolx(icmp->f, p->proto);
 			if(pr != nil && pr->advise != nil) {
+				netlog(icmp->f, Logicmp, "advising %s!%V -> %V: %s\n", pr->name, p->src, p->dst, msg);
 				(*pr->advise)(pr, bp, msg);
 				return;
 			}
 		}
 		bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE;
-		goticmpkt(icmp, bp);
-		break;
-	case TimeExceed:
-		if(p->code == 0){
-			snprint(msg = m2, sizeof m2, "ttl exceeded at %V", p->src);
-			goto Advise;
-		}
-		goticmpkt(icmp, bp);
-		break;
+		/* wet floor */
 	default:
 		goticmpkt(icmp, bp);
 		break;
 	}
 	return;
-
 raise:
 	freeblist(bp);
 }
 
+/*
+ * called from protocol advice handlers when the advice
+ * is actually for someone we source translate (ip4).
+ * on entry bp->rp points at the inner ip header, whose address
+ * and ports the caller has already fixed up.  we step back to the
+ * outer ip/icmp headers, redirect them to ip4, recalculate the icmp
+ * checksum and send the advice on; on any check failure bp is freed.
+ */
+void
+icmpproxyadvice(Fs *f, Block *bp, uchar *ip4)
+{
+	Icmp	*p;
+	int	hop;
+
+	/* inner header: must be ip4 without options and have a valid header checksum */
+	p = (Icmp *) bp->rp;
+	if(p->vihl != (IP_VER4|IP_HLEN4))
+		goto drop;
+	if(ipcsum(&p->vihl) != 0)
+		goto drop;
+
+	/* outer header: its ip header length must be exactly ICMP_IPSIZE bytes */
+	bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE;
+	p = (Icmp *) bp->rp;
+	if(p->vihl != (IP_VER4|(ICMP_IPSIZE>>2)))
+		goto drop;
+
+	hop = p->ttl;
+	if(hop <= 1)	/* ttl would be used up by forwarding */
+		goto drop;
+
+	netlog(f, Logicmp|Logtrans, "proxying icmp advice from %V to %V->%V\n",
+		p->src, p->dst, ip4);
+	memmove(p->dst, ip4, IPv4addrlen);
+
+	/* recalculate ICMP checksum over everything after the outer ip header */
+	memset(p->cksum, 0, sizeof(p->cksum));
+	hnputs(p->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE));
+
+	ipoput4(f, bp, 1, hop - 1, p->tos, nil);
+	return;
+drop:
+	freeblist(bp);
+}
+
 static void
 icmpadvise(Proto *icmp, Block *bp, char *msg)
 {
-	ushort	recid;
 	uchar	dst[IPaddrlen], src[IPaddrlen];
+	ushort	recid;
 	Conv	**c, *s;
 	Icmp	*p;
+	Iphash	*iph;
 
 	p = (Icmp *) bp->rp;
 	v4tov6(dst, p->dst);
@@ -459,6 +529,23 @@
 	v4tov6(src, p->src);
 	recid = nhgets(p->icmpid);
 
+	qlock(icmp);
+	iph = iphtlook(&((Icmppriv*)icmp->priv)->ht, dst, recid, src, recid);
+	if(iph != nil){
+		Translation *q;
+
+		if((q = transbackward(icmp, iph)) == nil)
+			goto raise;
+
+		hnputs_csum(p->src+0, nhgets(q->forward.raddr+IPv4off+0), p->ipcksum);
+		hnputs_csum(p->src+2, nhgets(q->forward.raddr+IPv4off+2), p->ipcksum);
+
+		hnputs_csum(p->icmpid, q->forward.rport, p->cksum);
+		qunlock(icmp);
+
+		icmpproxyadvice(icmp->f, bp, p->src);
+		return;
+	}
 	for(c = icmp->conv; (s = *c) != nil; c++){
 		if(s->lport == recid)
 		if(ipcmp(s->laddr, src) == 0)
@@ -470,9 +557,38 @@
 			break;
 		}
 	}
+raise:
+	qunlock(icmp);
 	freeblist(bp);
 }
 
+static Block*
+icmpforward(Proto *icmp, Block *bp, Route *r)
+{
+	uchar da[IPaddrlen], sa[IPaddrlen];
+	ushort id;
+	Icmp *p;
+	Translation *q;
+	/* Proto.forward hook for icmp: source-translate bp in place; returns bp, or nil after freeing it */
+	p = (Icmp*)(bp->rp);
+	v4tov6(sa, p->src);
+	v4tov6(da, p->dst);
+	id = nhgets(p->icmpid);
+	/* the icmp id is used in place of both ports of the 4-tuple */
+	qlock(icmp);
+	q = transforward(icmp, &((Icmppriv*)icmp->priv)->ht, sa, id, da, id, r);
+	if(q == nil){
+		qunlock(icmp);
+		freeblist(bp);
+		return nil;
+	}
+	memmove(p->src, q->backward.laddr+IPv4off, IPv4addrlen);	/* source becomes the translation's local address */
+	hnputs_csum(p->icmpid, q->backward.lport, p->cksum);	/* id becomes the translation's local port; icmp checksum adjusted */
+	qunlock(icmp);
+
+	return bp;
+}
+
 static int
 icmpstats(Proto *icmp, char *buf, int len)
 {
@@ -511,6 +627,7 @@
 	icmp->stats = icmpstats;
 	icmp->ctl = nil;
 	icmp->advise = icmpadvise;
+	icmp->forward = icmpforward;
 	icmp->gc = nil;
 	icmp->ipproto = IP_ICMPPROTO;
 	icmp->nc = 128;
--- a/sys/src/9/ip/icmp6.c
+++ b/sys/src/9/ip/icmp6.c
@@ -711,9 +711,8 @@
 			goto raise;
 		}
 		p = (IPICMP *)bp->rp;
-
 		/* get rid of fragment header if this is the first fragment */
-		if(p->proto == FH && BLEN(bp) >= MinAdvise+IP6FHDR && MinAdvise > IP6HDR){
+		if((p->vcf[0] & 0xF0) == IP_VER6 && p->proto == FH && BLEN(bp) >= MinAdvise+IP6FHDR && MinAdvise > IP6HDR){
 			Fraghdr6 *fh = (Fraghdr6*)(bp->rp + IP6HDR);
 			if((nhgets(fh->offsetRM) & ~7) == 0){	/* first fragment */
 				p->proto = fh->nexthdr;
@@ -725,9 +724,10 @@
 				bp->rp -= IP6HDR;
 			}
 		}
-		if(p->proto != FH){
+		if((p->vcf[0] & 0xF0) == IP_VER6 && p->proto != FH){
 			pr = Fsrcvpcolx(icmp->f, p->proto);
 			if(pr != nil && pr->advise != nil) {
+				netlog(icmp->f, Logicmp, "advising %s!%I -> %I: %s\n", pr->name, p->src, p->dst, msg);
 				(*pr->advise)(pr, bp, msg);
 				return;
 			}
--- a/sys/src/9/ip/il.c
+++ b/sys/src/9/ip/il.c
@@ -308,8 +308,8 @@
 	ic = (Ilcb*)c->ptcl;
 	ic->state = Ilclosed;
 	iphtrem(&ipriv->ht, c);
-	ipmove(c->laddr, IPnoaddr);
 	c->lport = 0;
+	ipmove(c->laddr, IPnoaddr);
 }
 
 static void
@@ -544,6 +544,7 @@
 	uchar laddr[IPaddrlen];
 	ushort sp, dp, csum;
 	int plen, illen;
+	Iphash *iph;
 	Conv *new, *s;
 	Ilpriv *ipriv;
 
@@ -584,14 +585,14 @@
 	}
 
 	qlock(il);
-	s = iphtlook(&ipriv->ht, raddr, dp, laddr, sp);
-	if(s == nil){
+	iph = iphtlook(&ipriv->ht, raddr, dp, laddr, sp);
+	if(iph == nil){
 		if(ih->iltype == Ilsync)
 			ilreject(il->f, ih);		/* no listener */
 		qunlock(il);
 		goto raise;
 	}
-
+	s = iphconv(iph);
 	ic = (Ilcb*)s->ptcl;
 	if(ic->state == Illistening){
 		if(ih->iltype != Ilsync){
--- a/sys/src/9/ip/ip.c
+++ b/sys/src/9/ip/ip.c
@@ -252,7 +252,7 @@
 void
 ipiput4(Fs *f, Ipifc *ifc, Block *bp)
 {
-	int hl, len, hop, tos;
+	int hl, len, hop;
 	uchar v6dst[IPaddrlen];
 	ushort frag;
 	Ip4hdr *h;
@@ -327,27 +327,53 @@
 
 		/* don't forward if packet has timed out */
 		hop = h->ttl;
-		if(hop < 1) {
+		if(hop <= 1) {
 			ip->stats[InHdrErrors]++;
 			icmpttlexceeded(f, ifc, bp);
 			goto drop;
 		}
 
-		/* reassemble if the interface expects it */
-		if(nifc->reassemble){
+		if(r->type & Rtrans) {
+			p = Fsrcvpcolx(f, h->proto);
+			if(p == nil || p->forward == nil){
+				ip->stats[OutDiscards]++;
+				goto drop;
+			}
+
+			if(hl > IP4HDR) {
+				hl -= IP4HDR;
+				len -= hl;
+				bp->rp += hl;
+				memmove(bp->rp, h, IP4HDR);
+				h = (Ip4hdr*)bp->rp;
+				h->vihl = IP_VER4|IP_HLEN4;
+				hnputs(h->length, len);
+			}
+
 			frag = nhgets(h->frag);
 			if(frag & (IP_MF|IP_FO)) {
 				bp = ip4reassemble(ip, frag, bp);
 				if(bp == nil)
 					return;
+			}
+
+			bp = (*p->forward)(p, bp, r);
+			if(bp == nil)
+				return;
+			h = (Ip4hdr*)bp->rp;
+		} else if(nifc->reassemble) {
+			/* reassemble as the interface expects it */
+			frag = nhgets(h->frag);
+			if(frag & (IP_MF|IP_FO)) {
+				bp = ip4reassemble(ip, frag, bp);
+				if(bp == nil)
+					return;
 				h = (Ip4hdr*)bp->rp;
 			}
 		}
 
 		ip->stats[ForwDatagrams]++;
-		tos = h->tos;
-		hop = h->ttl;
-		ipoput4(f, bp, 1, hop - 1, tos, &rh);
+		ipoput4(f, bp, 1, hop - 1, h->tos, &rh);
 		return;
 	}
 
--- a/sys/src/9/ip/ip.h
+++ b/sys/src/9/ip/ip.h
@@ -22,6 +22,7 @@
 typedef struct	Arp Arp;
 typedef struct	Route	Route;
 typedef struct	Routehint Routehint;
+typedef struct	Translation Translation;
 
 typedef struct	Routerparams	Routerparams;
 typedef struct 	Hostparams	Hostparams;
@@ -178,6 +179,71 @@
 };
 
 /*
+ *  hash table for 2 ip addresses + 2 ports
+ */
+enum
+{
+	Nipht=		521,	/* convenient prime */
+
+	IPmatchexact=	0,	/* match on 4 tuple */
+	IPmatchany,		/* *!* */
+	IPmatchport,		/* *!port */
+	IPmatchaddr,		/* addr!* */
+	IPmatchpa,		/* addr!port */
+};
+
+struct Iphash
+{
+	Iphash	*nextiphash;		/* next entry in the same hash bucket */
+
+	uchar	trans;			/* 0 = conv, 1 = forward, 2 = backward */
+	uchar	match;			/* one of the IPmatch* values, set by iphtadd() */
+	ushort	lport;			/* local port number */
+	ushort	rport;			/* remote port number */
+	uchar	laddr[IPaddrlen];	/* local IP address */
+	uchar	raddr[IPaddrlen];	/* remote IP address */
+};
+
+struct Ipht
+{
+	Lock;				/* held by iphtadd/iphtrem/iphtlook while touching tab */
+	Iphash	*tab[Nipht];		/* bucket heads, indexed by the 4-tuple hash */
+};
+
+void iphtadd(Ipht*, Iphash*);
+void iphtrem(Ipht*, Iphash*);
+Iphash *iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp);
+
+/*
+ * NAT entry.
+ *
+ * This holds the 5 tuple as two Iphashes.
+ * The "forward" hash matches the packets from
+ * the source that need to be translated and
+ * "backward" matches the packets coming back
+ * from the destination.
+ */
+struct Translation
+{
+	Translation *next;	/* next entry on Proto.translations */
+	Translation **link;	/* address of the pointer that points at this entry; nil while unlinked */
+
+	ulong	time;		/* NOW at last use, set by transupdate() */
+
+	Iphash	forward;
+#define iphforward(h) ((Translation*)((char*)(h) - (char*)&((Translation*)0)->forward))
+
+	Iphash	backward;
+#define iphbackward(h) ((Translation*)((char*)(h) - (char*)&((Translation*)0)->backward))
+
+	/* used for forwarding to the source */
+	Routehint;
+};
+
+Translation *transforward(Proto *p, Ipht *ht, uchar *sa, int sp, uchar *da, int dp, Route *r);
+Translation *transbackward(Proto *p, Iphash *iph);
+
+/*
  *  one per conversation directory
  */
 struct Conv
@@ -187,17 +253,16 @@
 	int	x;			/* conversation index */
 	Proto*	p;
 
-	int	restricted;		/* remote port is restricted */
-	int	ignoreadvice;		/* don't terminate connection on icmp errors */
 	uint	ttl;			/* max time to live */
 	uint	tos;			/* type of service */
 
+	uchar	restricted;		/* remote port is restricted */
+	uchar	ignoreadvice;		/* don't terminate connection on icmp errors */
 	uchar	ipversion;
-	uchar	laddr[IPaddrlen];	/* local IP address */
-	uchar	raddr[IPaddrlen];	/* remote IP address */
-	ushort	lport;			/* local port number */
-	ushort	rport;			/* remote port number */
 
+	Iphash;
+#define iphconv(h) ((Conv*)((char*)(h) - (char*)&((Conv*)0)->Iphash))
+
 	char	*owner;			/* protections */
 	int	perm;
 	int	inuse;			/* opens of listen/data/ctl */
@@ -206,7 +271,6 @@
 
 	/* udp specific */
 	int	headers;		/* data src/dst headers in udp */
-	int	reliable;		/* true if reliable udp */
 
 	Conv*	incall;			/* calls waiting to be listened for */
 	Conv*	next;
@@ -352,34 +416,6 @@
 };
 
 /*
- *  hash table for 2 ip addresses + 2 ports
- */
-enum
-{
-	Nipht=		521,	/* convenient prime */
-
-	IPmatchexact=	0,	/* match on 4 tuple */
-	IPmatchany,		/* *!* */
-	IPmatchport,		/* *!port */
-	IPmatchaddr,		/* addr!* */
-	IPmatchpa,		/* addr!port */
-};
-struct Iphash
-{
-	Iphash	*next;
-	Conv	*c;
-	int	match;
-};
-struct Ipht
-{
-	Lock;
-	Iphash	*tab[Nipht];
-};
-void iphtadd(Ipht*, Conv*);
-void iphtrem(Ipht*, Conv*);
-Conv* iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp);
-
-/*
  *  one per multiplexed protocol
  */
 struct Proto
@@ -412,10 +448,16 @@
 	Qid		qid;		/* qid for protocol directory */
 	ushort		nextrport;
 
+	/* network address translation */
+	Translation*	translations;
+	Block*		(*forward)(Proto*, Block*, Route*);
+
 	void		*priv;
 };
 
+int unusedlport(Proto *p);
 
+
 /*
  *  one per IP protocol stack
  */
@@ -489,6 +531,7 @@
 	Logrudpmsg=	1<<16,
 	Logesp=		1<<17,
 	Logtcpwin=	1<<18,
+	Logtrans=	1<<19,
 };
 
 void	netloginit(Fs*);
@@ -522,7 +565,9 @@
 	Rbcast=		(1<<4),		/* a broadcast self address */
 	Rmulti=		(1<<5),		/* a multicast self address */
 	Rproxy=		(1<<6),		/* this route should be proxied */
-	Rsrc=		(1<<7),		/* source specific route */
+	Rtrans=		(1<<7),		/* this route translates source address (NAT) */
+
+	Rsrc=		(1<<8),		/* source specific route */
 };
 
 struct	RouteTree
@@ -533,7 +578,7 @@
 	Ipifc	*ifc;
 	uchar	ifcid;		/* must match ifc->id */
 	uchar	depth;
-	uchar	type;
+	ushort	type;
 	char	tag[4];
 	int	ref;
 };
@@ -641,10 +686,13 @@
 extern void	v4tov6(uchar *v6, uchar *v4);
 extern int	v6tov4(uchar *v4, uchar *v6);
 extern int	eipfmt(Fmt*);
+extern int	ipismulticast(uchar *ip);
 extern int	convipvers(Conv *c);
+extern void	hnputs_csum(void *p, ushort v, uchar *pcsum);
 
 #define	ipmove(x, y) memmove(x, y, IPaddrlen)
 #define	ipcmp(x, y) ( (x)[IPaddrlen-1] != (y)[IPaddrlen-1] || memcmp(x, y, IPaddrlen) )
+#define	isv4mcast(ip4)	((ip4)[0] >= 0xe0 && (ip4)[0] < 0xf0)
 
 extern uchar IPv4bcast[IPaddrlen];
 extern uchar IPv4bcastobs[IPaddrlen];
@@ -670,7 +718,6 @@
 extern void	addipmedium(Medium *med);
 extern void	ipifcoput(Ipifc *ifc, Block *bp, int version, uchar *ip, Routehint *rh);
 extern int	ipforme(Fs*, uchar *addr);
-extern int	ipismulticast(uchar *ip);
 extern Ipifc*	findipifc(Fs*, uchar *local, uchar *remote, int type);
 extern Ipifc*	findipifcstr(Fs *f, char *s);
 extern void	findlocalip(Fs*, uchar *local, uchar *remote);
@@ -694,6 +741,8 @@
 extern void	icmpnoconv(Fs*, Block*);
 extern void	icmpcantfrag(Fs*, Block*, int);
 extern void	icmpttlexceeded(Fs*, Ipifc*, Block*);
+extern void	icmpproxyadvice(Fs *, Block*, uchar*);
+
 extern ushort	ipcsum(uchar*);
 extern void	ipiput4(Fs*, Ipifc*, Block*);
 extern void	ipiput6(Fs*, Ipifc*, Block*);
--- a/sys/src/9/ip/ipaux.c
+++ b/sys/src/9/ip/ipaux.c
@@ -203,7 +203,6 @@
 	smcast[15] = a[15];
 }
 
-
 /*
  *  parse a hex mac address
  */
@@ -233,9 +232,36 @@
 }
 
 /*
+ *  return multicast version if any (V4, V6, or 0 when not multicast)
+ */
+int
+ipismulticast(uchar *ip)
+{
+	if(isv4(ip)){
+		if(isv4mcast(&ip[IPv4off]))	/* test the ip4 part at offset IPv4off */
+			return V4;
+	}
+	else if(isv6mcast(ip))
+		return V6;
+	return 0;
+}
+
+/*
+ *  return ip version of a connection: V4 if both addresses are v4 or raddr is IPnoaddr, else V6
+ */
+int
+convipvers(Conv *c)
+{
+	if(isv4(c->raddr) && isv4(c->laddr) || ipcmp(c->raddr, IPnoaddr) == 0)
+		return V4;
+	else
+		return V6;
+}
+
+/*
  *  hashing tcp, udp, ... connections
  */
-ulong
+static ulong
 iphash(uchar *sa, ushort sp, uchar *da, ushort dp)
 {
 	return ((sa[IPaddrlen-1]<<24) ^ (sp << 16) ^ (da[IPaddrlen-1]<<8) ^ dp ) % Nipht;
@@ -242,136 +268,302 @@
 }
 
 void
-iphtadd(Ipht *ht, Conv *c)
+iphtadd(Ipht *ht, Iphash *h)
 {
 	ulong hv;
-	Iphash *h;
 
-	hv = iphash(c->raddr, c->rport, c->laddr, c->lport);
-	h = smalloc(sizeof(*h));
-	if(ipcmp(c->raddr, IPnoaddr) != 0)
+	if(ipcmp(h->raddr, IPnoaddr) != 0)
 		h->match = IPmatchexact;
 	else {
-		if(ipcmp(c->laddr, IPnoaddr) != 0){
-			if(c->lport == 0)
+		if(ipcmp(h->laddr, IPnoaddr) != 0){
+			if(h->lport == 0)
 				h->match = IPmatchaddr;
 			else
 				h->match = IPmatchpa;
 		} else {
-			if(c->lport == 0)
+			if(h->lport == 0)
 				h->match = IPmatchany;
 			else
 				h->match = IPmatchport;
 		}
 	}
-	h->c = c;
-
 	lock(ht);
-	h->next = ht->tab[hv];
+	hv = iphash(h->raddr, h->rport, h->laddr, h->lport);
+	h->nextiphash = ht->tab[hv];
 	ht->tab[hv] = h;
 	unlock(ht);
 }
 
 void
-iphtrem(Ipht *ht, Conv *c)
+iphtrem(Ipht *ht, Iphash *h)
 {
 	ulong hv;
-	Iphash **l, *h;
+	Iphash **l;
 
-	hv = iphash(c->raddr, c->rport, c->laddr, c->lport);
 	lock(ht);
-	for(l = &ht->tab[hv]; (*l) != nil; l = &(*l)->next)
-		if((*l)->c == c){
-			h = *l;
-			(*l) = h->next;
-			free(h);
+	hv = iphash(h->raddr, h->rport, h->laddr, h->lport);
+	for(l = &ht->tab[hv]; (*l) != nil; l = &(*l)->nextiphash)
+		if(*l == h){
+			(*l) = h->nextiphash;
+			h->nextiphash = nil;
 			break;
 		}
 	unlock(ht);
 }
 
-/* look for a matching conversation with the following precedence
- *	connected && raddr,rport,laddr,lport
- *	announced && laddr,lport
- *	announced && *,lport
- *	announced && laddr,*
- *	announced && *,*
+/* look for a matching iphash with the following precedence
+ *	raddr,rport,laddr,lport
+ *	laddr,lport
+ *	*,lport
+ *	laddr,*
+ *	*,*
  */
-Conv*
+Iphash*
 iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp)
 {
 	ulong hv;
 	Iphash *h;
-	Conv *c;
 
+	lock(ht);
 	/* exact 4 pair match (connection) */
 	hv = iphash(sa, sp, da, dp);
-	lock(ht);
-	for(h = ht->tab[hv]; h != nil; h = h->next){
+	for(h = ht->tab[hv]; h != nil; h = h->nextiphash){
 		if(h->match != IPmatchexact)
 			continue;
-		c = h->c;
-		if(sp == c->rport && dp == c->lport
-		&& ipcmp(sa, c->raddr) == 0 && ipcmp(da, c->laddr) == 0){
+		if(sp == h->rport && dp == h->lport
+		&& ipcmp(sa, h->raddr) == 0 && ipcmp(da, h->laddr) == 0){
 			unlock(ht);
-			return c;
+			return h;
 		}
 	}
 
 	/* match local address and port */
 	hv = iphash(IPnoaddr, 0, da, dp);
-	for(h = ht->tab[hv]; h != nil; h = h->next){
+	for(h = ht->tab[hv]; h != nil; h = h->nextiphash){
 		if(h->match != IPmatchpa)
 			continue;
-		c = h->c;
-		if(dp == c->lport && ipcmp(da, c->laddr) == 0){
+		if(dp == h->lport && ipcmp(da, h->laddr) == 0){
 			unlock(ht);
-			return c;
+			return h;
 		}
 	}
 
 	/* match just port */
 	hv = iphash(IPnoaddr, 0, IPnoaddr, dp);
-	for(h = ht->tab[hv]; h != nil; h = h->next){
+	for(h = ht->tab[hv]; h != nil; h = h->nextiphash){
 		if(h->match != IPmatchport)
 			continue;
-		c = h->c;
-		if(dp == c->lport){
+		if(dp == h->lport){
 			unlock(ht);
-			return c;
+			return h;
 		}
 	}
 
 	/* match local address */
 	hv = iphash(IPnoaddr, 0, da, 0);
-	for(h = ht->tab[hv]; h != nil; h = h->next){
+	for(h = ht->tab[hv]; h != nil; h = h->nextiphash){
 		if(h->match != IPmatchaddr)
 			continue;
-		c = h->c;
-		if(ipcmp(da, c->laddr) == 0){
+		if(ipcmp(da, h->laddr) == 0){
 			unlock(ht);
-			return c;
+			return h;
 		}
 	}
 
 	/* look for something that matches anything */
 	hv = iphash(IPnoaddr, 0, IPnoaddr, 0);
-	for(h = ht->tab[hv]; h != nil; h = h->next){
+	for(h = ht->tab[hv]; h != nil; h = h->nextiphash){
 		if(h->match != IPmatchany)
 			continue;
-		c = h->c;
 		unlock(ht);
-		return c;
+		return h;
 	}
 	unlock(ht);
 	return nil;
 }
 
-int
-convipvers(Conv *c)
+/*
+ * Move entry to front of Proto.translations
+ * and update the timestamp.
+ *
+ * Proto is locked.
+ */
+static Translation*
+transupdate(Proto *p, Translation *q)
 {
-	if(isv4(c->raddr) && isv4(c->laddr) || ipcmp(c->raddr, IPnoaddr) == 0)
-		return V4;
+	q->time = NOW;
+
+	/* unlink */
+	if(q->link != nil && (*q->link = q->next) != nil)
+		q->next->link = q->link;
+
+	/* link to front */
+	if((q->next = p->translations) != nil)
+		q->next->link = &q->next;
+	p->translations = q;
+	q->link = &p->translations;
+
+	return q;
+}
+
+/*
+ * Called with the 4-tuple (sa,sp,da,dp)
+ * that should be source translated,
+ * returning the translation.
+ *
+ * Proto is locked.
+ */
+Translation*
+transforward(Proto *p, Ipht *ht, uchar *sa, int sp, uchar *da, int dp, Route *r)
+{
+	uchar ia[IPaddrlen];
+	Routehint rh;
+	Translation *q;
+	Iphash *iph;
+	Ipifc *ifc;
+	int lport;
+	ulong now;
+	int num;
+
+	/* Translation already exists? */
+	iph = iphtlook(ht, sa, sp, da, dp);
+	if(iph != nil) {
+		if(iph->trans != 1)
+			return nil;
+		return transupdate(p, iphforward(iph));
+	}
+
+	/* Bad source address? */
+	if(ipismulticast(sa) || ipforme(p->f, sa) != 0){
+		netlog(p->f, Logtrans, "trans: bad source address: %s!%I!%d -> %I!%d\n",
+			p->name, sa, sp, da, dp);
+		return nil;
+	}
+
+	/* Bad forward route? */
+	if(r == nil || (ifc = r->ifc) == nil){
+		netlog(p->f, Logtrans, "trans: no forward route: %s!%I!%d -> %I!%d\n",
+			p->name, sa, sp, da, dp);
+		return nil;
+	}
+
+	/* Find a source address on the destination interface */
+	rlock(ifc);
+	memmove(ia, v4prefix, IPv4off);
+	if(!ipv4local(ifc, ia+IPv4off, 0, (r->type & (Rifc|Runi|Rbcast|Rmulti))? da+IPv4off: r->v4.gate)){
+		runlock(ifc);
+		netlog(p->f, Logtrans, "trans: no source ip: %s!%I!%d -> %I!%d\n",
+			p->name, sa, sp, da, dp);
+		return nil;
+	}
+	runlock(ifc);
+
+	/* Check backward route */
+	rh.a = nil;
+	rh.r = nil;
+	if(ipismulticast(da))
+		r = v4lookup(p->f, sa+IPv4off, ia+IPv4off, nil);
 	else
-		return V6;
+		r = v4lookup(p->f, sa+IPv4off, da+IPv4off, &rh);
+	if(r == nil || (r->ifc == ifc && !ifc->reflect)){
+		netlog(p->f, Logtrans, "trans: bad backward route: %s!%I!%d <- %I <- %I!%d\n",
+			p->name, sa, sp, ia, da, dp);
+		return nil;
+	}
+
+	/* Find local port */
+	lport = unusedlport(p);
+	if(lport <= 0){
+		netlog(p->f, Logtrans, "trans: no local port: %s!%I!%d <- %I <- %I!%d\n",
+			p->name, sa, sp, ia, da, dp);
+		return nil;
+	}
+
+	/* Reuse expired entries */
+	num = 0;
+	now = NOW;
+	for(q = p->translations; q != nil; q = q->next) {
+		if(++num >= 1000 || (now - q->time) >= 5*60*1000){
+			netlog(p->f, Logtrans, "trans: removing %s!%I!%d -> %I!%d -> %I!%d\n",
+				p->name,
+				q->forward.raddr, q->forward.rport,
+				q->backward.laddr, q->backward.lport,
+				q->forward.laddr, q->forward.lport);
+
+			iphtrem(ht, &q->forward);
+			iphtrem(ht, &q->backward);
+			break;
+		}
+	}
+	if(q == nil){
+		q = malloc(sizeof(*q));
+		if(q == nil)
+			return nil;
+		q->link = nil;
+	}
+
+	/* Match what needs to be forwarded */
+	q->forward.trans = 1;
+	q->forward.lport = dp;
+	q->forward.rport = sp;
+	ipmove(q->forward.laddr, da);
+	ipmove(q->forward.raddr, sa);
+
+	/* Match what comes back to us */
+	q->backward.trans = 2;
+	q->backward.lport = lport;
+	ipmove(q->backward.laddr, ia);
+	if(p->ipproto == 1 || ipismulticast(da)){
+		q->backward.rport = 0;
+		ipmove(q->backward.raddr, IPnoaddr);
+	} else {
+		q->backward.rport = dp;
+		ipmove(q->backward.raddr, da);
+	}
+	memmove(&q->Routehint, &rh, sizeof(rh));
+
+	netlog(p->f, Logtrans, "trans: adding %s!%I!%d -> %I!%d -> %I!%d\n",
+		p->name,
+		q->forward.raddr, q->forward.rport,
+		q->backward.laddr, q->backward.lport,
+		q->forward.laddr, q->forward.lport);
+
+	iphtadd(ht, &q->forward);
+	iphtadd(ht, &q->backward);
+
+	return transupdate(p, q);
+}
+
+/*
+ * Map a hash table hit to its Translation if it is a "backward"
+ * entry (trans == 2) and refresh it via transupdate(); nil otherwise.
+ *
+ * Proto is locked.
+ */
+Translation*
+transbackward(Proto *p, Iphash *iph)
+{
+	if(iph == nil || iph->trans != 2)
+		return nil;
+
+	return transupdate(p, iphbackward(iph));
+}
+
+/*
+ * Checksum adjusting hnputs(): store v at p and incrementally fix the 16-bit checksum at pcsum
+ */
+void
+hnputs_csum(void *p, ushort v, uchar *pcsum)
+{
+	ulong csum;
+
+	assert((((uchar*)p - pcsum) & 1) == 0);	/* p must sit at an even byte offset from the checksum */
+	/* new checksum = ~(~old checksum + ~old value + new value), carries folded back in */
+	csum = nhgets(pcsum)^0xFFFF;
+	csum += nhgets(p)^0xFFFF;
+	csum += v;
+	hnputs(p, v);
+	while(v = csum >> 16)	/* v is reused as the carry; fold until it fits in 16 bits */
+		csum = (csum & 0xFFFF) + v;
+	hnputs(pcsum, csum^0xFFFF);
 }
--- a/sys/src/9/ip/ipifc.c
+++ b/sys/src/9/ip/ipifc.c
@@ -1436,21 +1436,6 @@
 }
 
 /*
- *  return multicast version if any
- */
-int
-ipismulticast(uchar *ip)
-{
-	if(isv4(ip)){
-		if(ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0)
-			return V4;
-	}
-	else if(ip[0] == 0xff)
-		return V6;
-	return 0;
-}
-
-/*
  *  add a multicast address to an interface.
  */
 void
--- a/sys/src/9/ip/iproute.c
+++ b/sys/src/9/ip/iproute.c
@@ -875,6 +875,9 @@
 	case 'p':
 		if(((type ^= Rptpt) & Rptpt) != Rptpt) return -1;
 		break;
+	case 't':
+		if(((type ^= Rtrans) & Rtrans) != Rtrans) return -1;
+		break;
 	case '\0':
 		return type;
 	}
@@ -900,6 +903,10 @@
 
 	if(type & Rptpt)
 		*p++ = 'p';
+
+	if(type & Rtrans)
+		*p++ = 't';
+
 	*p = 0;
 }
 
--- a/sys/src/9/ip/ipv6.c
+++ b/sys/src/9/ip/ipv6.c
@@ -278,7 +278,7 @@
 
 		/* don't forward if packet has timed out */
 		hop = h->ttl;
-		if(hop < 1) {
+		if(hop <= 1) {
 			ip->stats[InHdrErrors]++;
 			icmpttlexceeded6(f, ifc, bp);
 			goto drop;
@@ -292,8 +292,7 @@
 		ip->stats[ForwDatagrams]++;
 		h = (Ip6hdr*)bp->rp;
 		tos = (h->vcf[0]&0x0F)<<2 | (h->vcf[1]&0xF0)>>2;
-		hop = h->ttl;
-		ipoput6(f, bp, 1, hop-1, tos, &rh);
+		ipoput6(f, bp, 1, hop - 1, tos, &rh);
 		return;
 	}
 
--- a/sys/src/9/ip/netlog.c
+++ b/sys/src/9/ip/netlog.c
@@ -51,6 +51,7 @@
 	{ "udpmsg",	Logudp|Logudpmsg, },
 	{ "ipmsg",	Logip|Logipmsg, },
 	{ "esp",	Logesp, },
+	{ "trans",	Logtrans, },
 	{ nil,		0, },
 };
 
--- a/sys/src/9/ip/rudp.c
+++ b/sys/src/9/ip/rudp.c
@@ -220,9 +220,10 @@
 	rudpstartackproc(c->p);
 	e = Fsstdconnect(c, argv, argc);
 	Fsconnected(c, e);
+	if(e != nil)
+		return e;
 	iphtadd(&upriv->ht, c);
-
-	return e;
+	return nil;
 }
 
 
@@ -256,7 +257,6 @@
 		return e;
 	Fsconnected(c, nil);
 	iphtadd(&upriv->ht, c);
-
 	return nil;
 }
 
@@ -289,10 +289,11 @@
 	qclose(c->rq);
 	qclose(c->wq);
 	qclose(c->eq);
-	ipmove(c->laddr, IPnoaddr);
-	ipmove(c->raddr, IPnoaddr);
+
 	c->lport = 0;
+	ipmove(c->laddr, IPnoaddr);
 	c->rport = 0;
+	ipmove(c->raddr, IPnoaddr);
 
 	ucb->headers = 0;
 	ucb->randdrop = 0;
@@ -460,11 +461,12 @@
 void
 rudpiput(Proto *rudp, Ipifc *ifc, Block *bp)
 {
-	int len, olen, ottl;
+	int len, olen;
 	Udphdr *uh;
+	Iphash *iph;
 	Conv *c;
 	Rudpcb *ucb;
-	uchar raddr[IPaddrlen], laddr[IPaddrlen];
+	uchar raddr[IPaddrlen], laddr[IPaddrlen], ottl;
 	ushort rport, lport;
 	Rudppriv *upriv;
 	Fs *f;
@@ -503,9 +505,8 @@
 	}
 
 	qlock(rudp);
-
-	c = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
-	if(c == nil){
+	iph = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
+	if(iph == nil){
 		/* no conversation found */
 		upriv->ustats.rudpNoPorts++;
 		qunlock(rudp);
@@ -517,6 +518,7 @@
 		freeblist(bp);
 		return;
 	}
+	c = iphconv(iph);
 	ucb = (Rudpcb*)c->ptcl;
 	qlock(ucb);
 	qunlock(rudp);
--- a/sys/src/9/ip/tcp.c
+++ b/sys/src/9/ip/tcp.c
@@ -126,7 +126,7 @@
 	uchar	length[2];	/* packet length */
 	uchar	id[2];		/* Identification */
 	uchar	frag[2];	/* Fragment information */
-	uchar	Unused;
+	uchar	ttl;
 	uchar	proto;
 	uchar	tcplen[2];
 	uchar	tcpsrc[4];
@@ -1814,9 +1814,7 @@
 	}
 
 	tcpsetstate(new, Established);
-
 	iphtadd(&tpriv->ht, new);
-
 	return new;
 }
 
@@ -2068,10 +2066,11 @@
 	Tcp seg;
 	Tcp4hdr *h4;
 	Tcp6hdr *h6;
-	int hdrlen;
 	Tcpctl *tcb;
-	ushort length, csum;
+	int hdrlen;
+	ushort length;
 	uchar source[IPaddrlen], dest[IPaddrlen];
+	Iphash *iph;
 	Conv *s;
 	Fs *f;
 	Tcppriv *tpriv;
@@ -2087,15 +2086,25 @@
 	h6 = (Tcp6hdr*)(bp->rp);
 
 	if((h4->vihl&0xF0)==IP_VER4) {
+		int ttl = h4->ttl;
+
 		version = V4;
 		length = nhgets(h4->length);
+		if(length < TCP4_PKT){
+			tpriv->stats[HlenErrs]++;
+			tpriv->stats[InErrs]++;
+			netlog(f, Logtcp, "bad tcp len\n");
+			freeblist(bp);
+			return;
+		}
+		length -= TCP4_PKT;
 		v4tov6(dest, h4->tcpdst);
 		v4tov6(source, h4->tcpsrc);
 
-		h4->Unused = 0;
-		hnputs(h4->tcplen, length-TCP4_PKT);
-		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
-			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
+		h4->ttl = 0;
+		hnputs(h4->tcplen, length);
+		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1])
+		&& ptclcsum(bp, TCP4_IPLEN, length + TCP4_PKT - TCP4_IPLEN)) {
 			tpriv->stats[CsumErrs]++;
 			tpriv->stats[InErrs]++;
 			netlog(f, Logtcp, "bad tcp proto cksum\n");
@@ -2102,6 +2111,7 @@
 			freeblist(bp);
 			return;
 		}
+		h4->ttl = ttl;
 
 		hdrlen = ntohtcp4(&seg, &bp);
 		if(hdrlen < 0){
@@ -2110,16 +2120,8 @@
 			netlog(f, Logtcp, "bad tcp hdr len\n");
 			return;
 		}
-
-		/* trim the packet to the size claimed by the datagram */
-		length -= hdrlen+TCP4_PKT;
-		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
-		if(bp == nil){
-			tpriv->stats[LenErrs]++;
-			tpriv->stats[InErrs]++;
-			netlog(f, Logtcp, "tcp len < 0 after trim\n");
-			return;
-		}
+		length -= hdrlen;
+		hdrlen += TCP4_PKT;
 	}
 	else {
 		int ttl = h6->ttl;
@@ -2133,13 +2135,13 @@
 		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
 		h6->ttl = proto;
 		hnputl(h6->vcf, length);
-		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
-		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
+		if((h6->tcpcksum[0] || h6->tcpcksum[1])
+		&& ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE) != 0) {
 			tpriv->stats[CsumErrs]++;
 			tpriv->stats[InErrs]++;
 			netlog(f, Logtcp,
-			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
-				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
+			    "bad tcpv6 proto cksum: got %#ux\n",
+				h6->tcpcksum[0]<<8 | h6->tcpcksum[1]);
 			freeblist(bp);
 			return;
 		}
@@ -2154,16 +2156,8 @@
 			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
 			return;
 		}
-
-		/* trim the packet to the size claimed by the datagram */
 		length -= hdrlen;
-		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
-		if(bp == nil){
-			tpriv->stats[LenErrs]++;
-			tpriv->stats[InErrs]++;
-			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
-			return;
-		}
+		hdrlen += TCP6_PKT;
 	}
 
 	/* lock protocol while searching for a conversation */
@@ -2170,8 +2164,8 @@
 	qlock(tcp);
 
 	/* Look for a matching conversation */
-	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
-	if(s == nil){
+	iph = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
+	if(iph == nil){
 		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
 			source, seg.source, dest, seg.dest);
 reset:
@@ -2180,7 +2174,31 @@
 		sndrst(tcp, source, dest, length, &seg, version, "no conversation", nil);
 		return;
 	}
+	if(iph->trans){
+		Translation *q;
+		int hop = h4->ttl;
 
+		if(hop <= 1 || (q = transbackward(tcp, iph)) == nil)
+			goto reset;
+		hnputs_csum(h4->tcpdst+0, nhgets(q->forward.raddr+IPv4off+0), h4->tcpcksum);
+		hnputs_csum(h4->tcpdst+2, nhgets(q->forward.raddr+IPv4off+2), h4->tcpcksum);
+		hnputs_csum(h4->tcpdport, q->forward.rport, h4->tcpcksum);
+		qunlock(tcp);
+		ipoput4(f, bp, 1, hop - 1, h4->tos, q);
+		return;
+	}
+	s = iphconv(iph);
+
+	/* trim off ip and tcp headers */
+	bp = trimblock(bp, hdrlen, length);
+	if(bp == nil){
+		tpriv->stats[LenErrs]++;
+		tpriv->stats[InErrs]++;
+		netlog(f, Logtcp, "tcp bad length after header trim off\n");
+		qunlock(tcp);
+		return;
+	}
+
 	/* if it's a listener, look for the right flags and get a new conv */
 	tcb = (Tcpctl*)s->ptcl;
 	if(tcb->state == Listen){
@@ -3200,11 +3218,12 @@
 {
 	Tcp4hdr *h4;
 	Tcp6hdr *h6;
-	Tcpctl *tcb;
 	uchar source[IPaddrlen];
 	uchar dest[IPaddrlen];
 	ushort psource, pdest;
-	Conv *s, **p;
+	Iphash *iph;
+	Tcpctl *tcb;
+	Conv *s;
 
 	h4 = (Tcp4hdr*)(bp->rp);
 	h6 = (Tcp6hdr*)(bp->rp);
@@ -3221,33 +3240,73 @@
 		pdest = nhgets(h6->tcpdport);
 	}
 
-	/* Look for a connection */
+	/* Look for a connection (source/dest reversed; this is the original packet we sent) */
 	qlock(tcp);
-	for(p = tcp->conv; (s = *p) != nil; p++) {
-		tcb = (Tcpctl*)s->ptcl;
-		if(s->rport == pdest)
-		if(s->lport == psource)
-		if(tcb->state != Closed)
-		if(ipcmp(s->raddr, dest) == 0)
-		if(ipcmp(s->laddr, source) == 0){
-			if(s->ignoreadvice)
-				break;
-			qlock(s);
-			qunlock(tcp);
-			switch(tcb->state){
-			case Syn_sent:
-				localclose(s, msg);
-				break;
-			}
-			qunlock(s);
-			freeblist(bp);
-			return;
-		}
+	iph = iphtlook(&((Tcppriv*)tcp->priv)->ht, dest, pdest, source, psource);
+	if(iph == nil)
+		goto raise;
+	if(iph->trans){
+		Translation *q;
+
+		if((q = transbackward(tcp, iph)) == nil)
+			goto raise;
+
+		/* h4->tcplen is the ip header checksum */
+		hnputs_csum(h4->tcpsrc+0, nhgets(q->forward.raddr+IPv4off+0), h4->tcplen);
+		hnputs_csum(h4->tcpsrc+2, nhgets(q->forward.raddr+IPv4off+2), h4->tcplen);
+
+		/* don't bother fixing tcp checksum, packet is most likely truncated */
+		hnputs(h4->tcpsport, q->forward.rport);
+		qunlock(tcp);
+
+		icmpproxyadvice(tcp->f, bp, h4->tcpsrc);
+		return;
 	}
+	s = iphconv(iph);
+	if(s->ignoreadvice || s->state == Closed)
+		goto raise;
+	qlock(s);
 	qunlock(tcp);
+	tcb = (Tcpctl*)s->ptcl;
+	if(tcb->state == Syn_sent)
+		localclose(s, msg);
+	qunlock(s);
 	freeblist(bp);
+	return;
+raise:
+	qunlock(tcp);
+	freeblist(bp);
 }
 
+static Block*	/* returns bp with its source rewritten, or nil once bp has been freed */
+tcpforward(Proto *tcp, Block *bp, Route *r)
+{
+	uchar da[IPaddrlen], sa[IPaddrlen];
+	ushort dp, sp;
+	Tcp4hdr *h4;
+	Translation *q;
+
+	h4 = (Tcp4hdr*)(bp->rp);	/* block is read as an IPv4 header followed by TCP */
+	v4tov6(da, h4->tcpdst);
+	v4tov6(sa, h4->tcpsrc);
+	dp = nhgets(h4->tcpdport);
+	sp = nhgets(h4->tcpsport);
+
+	qlock(tcp);	/* held across the translation lookup and the header rewrite */
+	q = transforward(tcp, &((Tcppriv*)tcp->priv)->ht, sa, sp, da, dp, r);
+	if(q == nil){	/* no translation obtained: drop the packet */
+		qunlock(tcp);
+		freeblist(bp);
+		return nil;
+	}
+	hnputs_csum(h4->tcpsrc+0, nhgets(q->backward.laddr+IPv4off+0), h4->tcpcksum);	/* source address := q->backward.laddr, upper 16 bits */
+	hnputs_csum(h4->tcpsrc+2, nhgets(q->backward.laddr+IPv4off+2), h4->tcpcksum);	/* ... lower 16 bits */
+	hnputs_csum(h4->tcpsport, q->backward.lport, h4->tcpcksum);	/* source port := q->backward.lport; presumably hnputs_csum also adjusts tcpcksum - confirm */
+	qunlock(tcp);
+
+	return bp;
+}
+
 static char*
 tcpporthogdefensectl(char *val)
 {
@@ -3371,6 +3430,7 @@
 	tcp->close = tcpclose;
 	tcp->rcv = tcpiput;
 	tcp->advise = tcpadvise;
+	tcp->forward = tcpforward;
 	tcp->stats = tcpstats;
 	tcp->inuse = tcpinuse;
 	tcp->gc = tcpgc;
--- a/sys/src/9/ip/udp.c
+++ b/sys/src/9/ip/udp.c
@@ -39,7 +39,7 @@
 	uchar	length[2];	/* packet length */
 	uchar	id[2];		/* Identification */
 	uchar	frag[2];	/* Fragment information */
-	uchar	Unused;
+	uchar	ttl;      	/* Time to live */
 	uchar	udpproto;	/* Protocol */
 	uchar	udpplen[2];	/* Header plus data length */
 	uchar	udpsrc[IPv4addrlen];	/* Ip source */
@@ -91,7 +91,6 @@
 	ulong		lenerr;			/* short packet */
 };
 
-void (*etherprofiler)(char *name, int qlen);
 void udpkick(void *x, Block *bp);
 
 /*
@@ -114,7 +113,6 @@
 	Fsconnected(c, e);
 	if(e != nil)
 		return e;
-
 	iphtadd(&upriv->ht, c);
 	return nil;
 }
@@ -142,7 +140,6 @@
 		return e;
 	Fsconnected(c, nil);
 	iphtadd(&upriv->ht, c);
-
 	return nil;
 }
 
@@ -166,10 +163,10 @@
 	qclose(c->rq);
 	qclose(c->wq);
 	qclose(c->eq);
-	ipmove(c->laddr, IPnoaddr);
-	ipmove(c->raddr, IPnoaddr);
 	c->lport = 0;
+	ipmove(c->laddr, IPnoaddr);
 	c->rport = 0;
+	ipmove(c->raddr, IPnoaddr);
 
 	ucb = (Udpcb*)c->ptcl;
 	ucb->headers = 0;
@@ -238,7 +235,7 @@
 		bp = padblock(bp, UDP4_IPHDR_SZ+UDP_UDPHDR_SZ);
 		uh4 = (Udp4hdr *)(bp->rp);
 		ptcllen = dlen + UDP_UDPHDR_SZ;
-		uh4->Unused = 0;
+		uh4->ttl = 0;
 		uh4->udpproto = IP_UDPPROTO;
 		uh4->frag[0] = 0;
 		uh4->frag[1] = 0;
@@ -319,6 +316,7 @@
 	int len;
 	Udp4hdr *uh4;
 	Udp6hdr *uh6;
+	Iphash *iph;
 	Conv *c;
 	Udpcb *ucb;
 	uchar raddr[IPaddrlen], laddr[IPaddrlen];
@@ -334,6 +332,7 @@
 	upriv->ustats.udpInDatagrams++;
 
 	uh4 = (Udp4hdr*)(bp->rp);
+	uh6 = (Udp6hdr*)(bp->rp);
 	version = ((uh4->vihl&0xF0)==IP_VER6) ? V6 : V4;
 
 	/* Put back pseudo header for checksum
@@ -340,8 +339,8 @@
 	 * (remember old values for icmpnoconv()) */
 	switch(version) {
 	case V4:
-		ottl = uh4->Unused;
-		uh4->Unused = 0;
+		ottl = uh4->ttl;
+		uh4->ttl = 0;
 		len = nhgets(uh4->udplen);
 		olen = nhgets(uh4->udpplen);
 		hnputs(uh4->udpplen, len);
@@ -360,11 +359,10 @@
 				return;
 			}
 		}
-		uh4->Unused = ottl;
+		uh4->ttl = ottl;
 		hnputs(uh4->udpplen, olen);
 		break;
 	case V6:
-		uh6 = (Udp6hdr*)(bp->rp);
 		len = nhgets(uh6->udplen);
 		oviclfl = nhgetl(uh6->viclfl);
 		olen = nhgets(uh6->len);
@@ -394,9 +392,8 @@
 	}
 
 	qlock(udp);
-
-	c = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
-	if(c == nil){
+	iph = iphtlook(&upriv->ht, raddr, rport, laddr, lport);
+	if(iph == nil){
 		/* no conversation found */
 		upriv->ustats.udpNoPorts++;
 		qunlock(udp);
@@ -417,6 +414,26 @@
 		freeblist(bp);
 		return;
 	}
+	if(iph->trans){
+		Translation *q;
+		int hop = uh4->ttl;
+		if(hop <= 1 || (q = transbackward(udp, iph)) == nil){
+			qunlock(udp);
+			freeblist(bp);
+			return;
+		}
+		hnputs_csum(uh4->udpdst+0, nhgets(q->forward.raddr+IPv4off+0), uh4->udpcksum);
+		hnputs_csum(uh4->udpdst+2, nhgets(q->forward.raddr+IPv4off+2), uh4->udpcksum);
+		hnputs_csum(uh4->udpdport, q->forward.rport, uh4->udpcksum);
+
+		/* only use route-hint when from original destination */
+		if(memcmp(uh4->udpsrc, q->forward.laddr+IPv4off, IPv4addrlen) != 0)
+			q = nil;
+		qunlock(udp);
+		ipoput4(f, bp, 1, hop - 1, uh4->tos, q);
+		return;
+	}
+	c = iphconv(iph);
 	ucb = (Udpcb*)c->ptcl;
 
 	if(c->state == Announced){
@@ -487,7 +504,6 @@
 		qpass(c->rq, concatblock(bp));
 	}
 	qunlock(c);
-
 }
 
 char*
@@ -517,7 +533,8 @@
 	Udp6hdr *h6;
 	uchar source[IPaddrlen], dest[IPaddrlen];
 	ushort psource, pdest;
-	Conv *s, **p;
+	Iphash *iph;
+	Conv *s;
 
 	h4 = (Udp4hdr*)(bp->rp);
 	h6 = (Udp6hdr*)(bp->rp);
@@ -534,28 +551,72 @@
 		pdest = nhgets(h6->udpdport);
 	}
 
-	/* Look for a connection */
+	/* Look for a connection (source/dest reversed; this is the original packet we sent) */
 	qlock(udp);
-	for(p = udp->conv; (s = *p) != nil; p++) {
-		if(s->rport == pdest)
-		if(s->lport == psource)
-		if(ipcmp(s->raddr, dest) == 0)
-		if(ipcmp(s->laddr, source) == 0){
-			if(s->ignoreadvice)
-				break;
-			qlock(s);
-			qunlock(udp);
-			qhangup(s->rq, msg);
-			qhangup(s->wq, msg);
-			qunlock(s);
-			freeblist(bp);
-			return;
-		}
+	iph = iphtlook(&((Udppriv*)udp->priv)->ht, dest, pdest, source, psource);
+	if(iph == nil)
+		goto raise;
+	if(iph->trans){
+		Translation *q;
+
+		if((q = transbackward(udp, iph)) == nil)
+			goto raise;
+
+		/* h4->udpplen is the ip header checksum */
+		hnputs_csum(h4->udpsrc+0, nhgets(q->forward.raddr+IPv4off+0), h4->udpplen);
+		hnputs_csum(h4->udpsrc+2, nhgets(q->forward.raddr+IPv4off+2), h4->udpplen);
+
+		/* don't bother fixing udp checksum, packet is most likely truncated */
+		hnputs(h4->udpsport, q->forward.rport);
+		qunlock(udp);
+
+		icmpproxyadvice(udp->f, bp, h4->udpsrc);
+		return;
 	}
+	s = iphconv(iph);
+	if(s->ignoreadvice)
+		goto raise;
+	qlock(s);
 	qunlock(udp);
+	qhangup(s->rq, msg);
+	qhangup(s->wq, msg);
+	qunlock(s);
 	freeblist(bp);
+	return;
+raise:
+	qunlock(udp);
+	freeblist(bp);
 }
 
+Block*	/* returns bp with its source rewritten, or nil once bp has been freed */
+udpforward(Proto *udp, Block *bp, Route *r)
+{
+	uchar da[IPaddrlen], sa[IPaddrlen];
+	ushort dp, sp;
+	Udp4hdr *uh4;
+	Translation *q;
+
+	uh4 = (Udp4hdr*)(bp->rp);	/* block is read as an IPv4 header followed by UDP */
+	v4tov6(sa, uh4->udpsrc);
+	v4tov6(da, uh4->udpdst);
+	dp = nhgets(uh4->udpdport);
+	sp = nhgets(uh4->udpsport);
+
+	qlock(udp);	/* held across the translation lookup and the header rewrite */
+	q = transforward(udp, &((Udppriv*)udp->priv)->ht, sa, sp, da, dp, r);
+	if(q == nil){	/* no translation obtained: drop the packet */
+		qunlock(udp);
+		freeblist(bp);
+		return nil;
+	}
+	hnputs_csum(uh4->udpsrc+0, nhgets(q->backward.laddr+IPv4off+0), uh4->udpcksum);	/* source address := q->backward.laddr, upper 16 bits */
+	hnputs_csum(uh4->udpsrc+2, nhgets(q->backward.laddr+IPv4off+2), uh4->udpcksum);	/* ... lower 16 bits */
+	hnputs_csum(uh4->udpsport, q->backward.lport, uh4->udpcksum);	/* source port := q->backward.lport; presumably hnputs_csum also adjusts udpcksum - confirm */
+	qunlock(udp);
+
+	return bp;
+}
+
 int
 udpstats(Proto *udp, char *buf, int len)
 {
@@ -586,6 +647,7 @@
 	udp->close = udpclose;
 	udp->rcv = udpiput;
 	udp->advise = udpadvise;
+	udp->forward = udpforward;
 	udp->stats = udpstats;
 	udp->ipproto = IP_UDPPROTO;
 	udp->nc = Nchans;