/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *              Pentium Pro/II routines:
 *              Alexander Kjeldaas <astor@guardian.no>
 *              Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *                   converted to pure assembler
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */
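
/*
 * For reference, a rough C model of what csum_partial() computes: the
 * ones-complement sum of the buffer added into the incoming sum, with
 * carries wrapped around.  This is only an illustrative sketch (the
 * function name is made up); it ignores the byte-swap fixups the
 * assembly below performs for odd buffer addresses, and its 32-bit
 * result can differ from the assembly's, although both fold to the
 * same 16-bit checksum.
 *
 *	static unsigned int csum_partial_model(const unsigned char *buff,
 *					       int len, unsigned int sum)
 *	{
 *		unsigned long long acc = sum;
 *
 *		while (len > 1) {			// whole 16-bit words
 *			acc += *(const unsigned short *)buff;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)				// trailing odd byte
 *			acc += *buff;
 *		while (acc >> 32)			// wrap carries back in
 *			acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */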

.text

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

	  /*
	   * Experiments with Ethernet and SLIP connections show that buff
	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
	   * alignment for the unrolled loop.
	   */
ENTRY(csum_partial)
	CFI_STARTPROC
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $3, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	testl $1, %esi		# Check alignment.
	jz 10f			# Jump if buff is 2-byte aligned.

	# buff starts at an odd address
	dec %ecx
	jl 8f
	movzbl (%esi), %ebx
	adcl %ebx, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 2f
10:
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
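	/*
	 * Main path: %ecx holds the remaining length.  Sum 32 bytes per
	 * iteration (%ecx >> 5 iterations), then the leftover whole
	 * dwords (%ecx & 0x1c), then the final 0-3 bytes.
	 */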
	movl %ecx, %edx
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi	# clear CF before the adcl chain
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx		# This clears CF (low two bits of %edx are zero)
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
	movw (%esi),%cx
	leal 2(%esi),%esi
	je 6f
	shll $16,%ecx
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	testl $1, 12(%esp)	# was buff at an odd address?
	jz 8f
	roll $8, %eax		# rotate the sum to compensate
8:
	popl_cfi %ebx
	CFI_RESTORE ebx
	popl_cfi %esi
	CFI_RESTORE esi
	ret
	CFI_ENDPROC
ENDPROC(csum_partial)

#else

/* Version for PentiumII/PPro */

ENTRY(csum_partial)
	CFI_STARTPROC
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf

	testl $3, %esi
	jnz 25f
10:
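	/*
	 * The setup below builds a computed jump into the unrolled loop
	 * at label 40: %ebx ends up as minus the number of leftover
	 * dwords (len & 0x7c, divided by 4), %esi is advanced past them,
	 * and since every summing instruction in the unrolled block is
	 * 3 bytes long, "lea 45f(%ebx,%ebx,2)" points exactly that many
	 * dwords before label 45.  The testl clears CF before the adcls.
	 */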
	movl %ecx, %edx
	movl %ecx, %ebx
	andl $0x7c, %ebx
	shrl $7, %ecx
	addl %ebx,%esi
	shrl $2, %ebx
	negl %ebx
	lea 45f(%ebx,%ebx,2), %ebx
	testl %esi, %esi
	jmp *%ebx

	# Handle 2-byte-aligned regions
20:	addw (%esi), %ax
	lea 2(%esi), %esi
	adcl $0, %eax
	jmp 10b
25:
	testl $1, %esi
	jz 30f
	# buff starts at an odd address
	dec %ecx
	jl 90f
	movzbl (%esi), %ebx
	addl %ebx, %eax
	adcl $0, %eax
	roll $8, %eax
	inc %esi
	testl $2, %esi
	jz 10b

30:	subl $2, %ecx
	ja 20b
	je 32f
	addl $2, %ecx
	jz 80f
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

40:
	addl -128(%esi), %eax
	adcl -124(%esi), %eax
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi
	adcl $0, %eax
	dec %ecx
	jge 40b
	movl %edx, %ecx
50:	andl $3, %ecx
	jz 80f

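	/*
	 * Worked example for the mask trick below (len & 3 == 1):
	 * notl turns 1 into 0xfffffffe, shll $3 gives 0xfffffff0, and
	 * shrl only honours the low 5 bits of %cl (16), so the mask is
	 * 0xffffff >> 16 = 0xff and exactly one byte of the final dword
	 * survives.  Likewise 2 -> 0xffff and 3 -> 0xffffff.
	 */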
	# Handle the last 1-3 bytes without jumping
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx
	shrl %cl,%ebx
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax
	adcl $0,%eax
80:
	testl $1, 12(%esp)	# was buff at an odd address?
	jz 90f
	roll $8, %eax		# rotate the sum to compensate
90:
	popl_cfi %ebx
	CFI_RESTORE ebx
	popl_cfi %esi
	CFI_RESTORE esi
	ret
	CFI_ENDPROC
ENDPROC(csum_partial)

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
 */

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction;
 * thus we can call a custom exception handler for each access type.
 *
 * FIXME: could someone double-check whether I haven't mixed up some SRC and
 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
 *	  them all but there's no guarantee.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
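
/*
 * Each SRC()/DST() use emits an __ex_table entry pairing the address of
 * the possibly-faulting instruction with a fixup label: 6001f for source
 * reads, 6002f for destination writes.  On a fault the exception handler
 * looks the faulting address up in that table and resumes at the fixup,
 * which stores -EFAULT through src_err_ptr or dst_err_ptr and, for
 * source faults, zeroes the whole destination buffer.
 */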

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP		12

ENTRY(csum_partial_copy_generic)
	CFI_STARTPROC
	subl  $4,%esp
	CFI_ADJUST_CFA_OFFSET 4
	pushl_cfi %edi
	CFI_REL_OFFSET edi, 0
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	movl ARGBASE+16(%esp),%eax	# sum
	movl ARGBASE+12(%esp),%ecx	# len
	movl ARGBASE+4(%esp),%esi	# src
	movl ARGBASE+8(%esp),%edi	# dst

	testl $2, %edi			# Check alignment.
	jz 2f				# Jump if alignment is ok.
	subl $2, %ecx			# Alignment uses up two bytes.
	jae 1f				# Jump if we had at least two bytes.
	addl $2, %ecx			# ecx was < 2.  Deal with it.
	jmp 4f
SRC(1:	movw (%esi), %bx	)
	addl $2, %esi
DST(	movw %bx, (%edi)	)
	addl $2, %edi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, FP(%esp)
	shrl $5, %ecx
	jz 2f
	testl %esi, %esi		# clear CF before the adcl chain
SRC(1:	movl (%esi), %ebx	)
SRC(	movl 4(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 4(%edi)	)

SRC(	movl 8(%esi), %ebx	)
SRC(	movl 12(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 8(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 12(%edi)	)

SRC(	movl 16(%esi), %ebx 	)
SRC(	movl 20(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 16(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 20(%edi)	)

SRC(	movl 24(%esi), %ebx	)
SRC(	movl 28(%esi), %edx	)
	adcl %ebx, %eax
DST(	movl %ebx, 24(%edi)	)
	adcl %edx, %eax
DST(	movl %edx, 28(%edi)	)

	lea 32(%esi), %esi
	lea 32(%edi), %edi
	dec %ecx
	jne 1b
	adcl $0, %eax
2:	movl FP(%esp), %edx
	movl %edx, %ecx
	andl $0x1c, %edx
	je 4f
	shrl $2, %edx			# This clears CF (low two bits are zero)
SRC(3:	movl (%esi), %ebx	)
	adcl %ebx, %eax
DST(	movl %ebx, (%edi)	)
	lea 4(%esi), %esi
	lea 4(%edi), %edi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx
	jz 7f
	cmpl $2, %ecx
	jb 5f
SRC(	movw (%esi), %cx	)
	leal 2(%esi), %esi
DST(	movw %cx, (%edi)	)
	leal 2(%edi), %edi
	je 6f
	shll $16,%ecx
SRC(5:	movb (%esi), %cl	)
DST(	movb %cl, (%edi)	)
6:	addl %ecx, %eax
	adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)

	# zero the complete destination - computing the rest
	# is too much work
	movl ARGBASE+8(%esp), %edi	# dst
	movl ARGBASE+12(%esp), %ecx	# len
	xorl %eax,%eax
	rep ; stosb

	jmp 5000b

6002:
	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT,(%ebx)
	jmp 5000b

.previous

	popl_cfi %ebx
	CFI_RESTORE ebx
	popl_cfi %esi
	CFI_RESTORE esi
	popl_cfi %edi
	CFI_RESTORE edi
	popl_cfi %ecx			# equivalent to addl $4,%esp
	ret
	CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	addl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;

#define ROUND(x) \
	SRC(movl x(%esi), %ebx	)	;	\
	adcl %ebx, %eax			;	\
	DST(movl %ebx, x(%edi)	)	;
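
/*
 * Note: each ROUND/ROUND1 expansion is 8 bytes of code (3 + 2 + 3), a
 * fact the computed jump below relies on; the __ex_table entries the
 * SRC/DST wrappers add live in a separate section and do not change
 * that size.
 */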

#define ARGBASE 12

ENTRY(csum_partial_copy_generic)
	CFI_STARTPROC
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	pushl_cfi %edi
	CFI_REL_OFFSET edi, 0
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	movl ARGBASE+4(%esp),%esi	#src
	movl ARGBASE+8(%esp),%edi	#dst
	movl ARGBASE+12(%esp),%ecx	#len
	movl ARGBASE+16(%esp),%eax	#sum
#	movl %ecx, %edx
	movl %ecx, %ebx
	movl %esi, %edx
	shrl $6, %ecx
	andl $0x3c, %ebx
	negl %ebx
	subl %ebx, %esi
	subl %ebx, %edi
	lea  -1(%esi),%edx
	andl $-32,%edx
	lea 3f(%ebx,%ebx), %ebx
	testl %esi, %esi
	jmp *%ebx
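	/*
	 * As in csum_partial above: %ebx is minus the leftover byte
	 * count (len & 0x3c), %esi/%edi have been advanced past it, and
	 * with each ROUND being 8 bytes of code (two per leftover byte)
	 * the "lea 3f(%ebx,%ebx)" enters the unrolled copy/sum block
	 * just far enough before label 3 to handle those bytes.  %edx
	 * walks ahead of %esi; the two movb reads at label 1 appear to
	 * act as a software prefetch of the source cache lines about to
	 * be used.
	 */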
1:	addl $64,%esi
	addl $64,%edi
	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
3:	adcl $0,%eax
	addl $64, %edx
	dec %ecx
	jge 1b
4:	movl ARGBASE+12(%esp),%edx	#len
	andl $3, %edx
	jz 7f
	cmpl $2, %edx
	jb 5f
SRC(	movw (%esi), %dx         )
	leal 2(%esi), %esi
DST(	movw %dx, (%edi)         )
	leal 2(%edi), %edi
	je 6f
	shll $16,%edx
5:
SRC(	movb (%esi), %dl         )
DST(	movb %dl, (%edi)         )
6:	addl %edx, %eax
	adcl $0, %eax
7:
.section .fixup, "ax"
6001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr
	movl $-EFAULT, (%ebx)
	# zero the complete destination (computing the rest is too much work)
	movl ARGBASE+8(%esp),%edi	# dst
	movl ARGBASE+12(%esp),%ecx	# len
	xorl %eax,%eax
	rep; stosb
	jmp 7b
6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
	movl $-EFAULT, (%ebx)
	jmp  7b
.previous

	popl_cfi %esi
	CFI_RESTORE esi
	popl_cfi %edi
	CFI_RESTORE edi
	popl_cfi %ebx
	CFI_RESTORE ebx
	ret
	CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)

#undef ROUND
#undef ROUND1

#endif