mirror of
https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git
synced 2025-09-05 07:57:45 -04:00
Fixed MTP to work with TWRP
This commit is contained in:
commit
f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
8
Documentation/prctl/Makefile
Normal file
8
Documentation/prctl/Makefile
Normal file
|
@ -0,0 +1,8 @@
|
|||
# List of programs to build
|
||||
hostprogs-$(CONFIG_X86) := disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test disable-tsc-test
|
||||
# Tell kbuild to always build the programs
|
||||
always := $(hostprogs-y)
|
||||
|
||||
HOSTCFLAGS_disable-tsc-ctxt-sw-stress-test.o += -I$(objtree)/usr/include
|
||||
HOSTCFLAGS_disable-tsc-on-off-stress-test.o += -I$(objtree)/usr/include
|
||||
HOSTCFLAGS_disable-tsc-test.o += -I$(objtree)/usr/include
|
97
Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
Normal file
97
Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
Normal file
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
|
||||
*
|
||||
* Tests if the control register is updated correctly
|
||||
* at context switches
|
||||
*
|
||||
* Warning: this test will cause a very high load for a few seconds
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <inttypes.h>
|
||||
#include <wait.h>
|
||||
|
||||
|
||||
#include <sys/prctl.h>
|
||||
#include <linux/prctl.h>
|
||||
|
||||
/* Get/set the process' ability to use the timestamp counter instruction */
|
||||
#ifndef PR_GET_TSC
|
||||
#define PR_GET_TSC 25
|
||||
#define PR_SET_TSC 26
|
||||
# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
|
||||
# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
|
||||
#endif
|
||||
|
||||
static uint64_t rdtsc(void)
|
||||
{
|
||||
uint32_t lo, hi;
|
||||
/* We cannot use "=A", since this would use %rax on x86_64 */
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return (uint64_t)hi << 32 | lo;
|
||||
}
|
||||
|
||||
static void sigsegv_expect(int sig)
|
||||
{
|
||||
/* */
|
||||
}
|
||||
|
||||
static void segvtask(void)
|
||||
{
|
||||
if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
|
||||
{
|
||||
perror("prctl");
|
||||
exit(0);
|
||||
}
|
||||
signal(SIGSEGV, sigsegv_expect);
|
||||
alarm(10);
|
||||
rdtsc();
|
||||
fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
static void sigsegv_fail(int sig)
|
||||
{
|
||||
fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static void rdtsctask(void)
|
||||
{
|
||||
if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
|
||||
{
|
||||
perror("prctl");
|
||||
exit(0);
|
||||
}
|
||||
signal(SIGSEGV, sigsegv_fail);
|
||||
alarm(10);
|
||||
for(;;) rdtsc();
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int n_tasks = 100, i;
|
||||
|
||||
fprintf(stderr, "[No further output means we're allright]\n");
|
||||
|
||||
for (i=0; i<n_tasks; i++)
|
||||
if (fork() == 0)
|
||||
{
|
||||
if (i & 1)
|
||||
segvtask();
|
||||
else
|
||||
rdtsctask();
|
||||
}
|
||||
|
||||
for (i=0; i<n_tasks; i++)
|
||||
wait(NULL);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
96
Documentation/prctl/disable-tsc-on-off-stress-test.c
Normal file
96
Documentation/prctl/disable-tsc-on-off-stress-test.c
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
|
||||
*
|
||||
* Tests if the control register is updated correctly
|
||||
* when set with prctl()
|
||||
*
|
||||
* Warning: this test will cause a very high load for a few seconds
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <inttypes.h>
|
||||
#include <wait.h>
|
||||
|
||||
|
||||
#include <sys/prctl.h>
|
||||
#include <linux/prctl.h>
|
||||
|
||||
/* Get/set the process' ability to use the timestamp counter instruction */
|
||||
#ifndef PR_GET_TSC
|
||||
#define PR_GET_TSC 25
|
||||
#define PR_SET_TSC 26
|
||||
# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
|
||||
# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
|
||||
#endif
|
||||
|
||||
/* snippet from wikipedia :-) */
|
||||
|
||||
static uint64_t rdtsc(void)
|
||||
{
|
||||
uint32_t lo, hi;
|
||||
/* We cannot use "=A", since this would use %rax on x86_64 */
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return (uint64_t)hi << 32 | lo;
|
||||
}
|
||||
|
||||
int should_segv = 0;
|
||||
|
||||
static void sigsegv_cb(int sig)
|
||||
{
|
||||
if (!should_segv)
|
||||
{
|
||||
fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
|
||||
exit(0);
|
||||
}
|
||||
if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
|
||||
{
|
||||
perror("prctl");
|
||||
exit(0);
|
||||
}
|
||||
should_segv = 0;
|
||||
|
||||
rdtsc();
|
||||
}
|
||||
|
||||
static void task(void)
|
||||
{
|
||||
signal(SIGSEGV, sigsegv_cb);
|
||||
alarm(10);
|
||||
for(;;)
|
||||
{
|
||||
rdtsc();
|
||||
if (should_segv)
|
||||
{
|
||||
fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
|
||||
exit(0);
|
||||
}
|
||||
if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
|
||||
{
|
||||
perror("prctl");
|
||||
exit(0);
|
||||
}
|
||||
should_segv = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int n_tasks = 100, i;
|
||||
|
||||
fprintf(stderr, "[No further output means we're allright]\n");
|
||||
|
||||
for (i=0; i<n_tasks; i++)
|
||||
if (fork() == 0)
|
||||
task();
|
||||
|
||||
for (i=0; i<n_tasks; i++)
|
||||
wait(NULL);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
95
Documentation/prctl/disable-tsc-test.c
Normal file
95
Documentation/prctl/disable-tsc-test.c
Normal file
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
|
||||
*
|
||||
* Basic test to test behaviour of PR_GET_TSC and PR_SET_TSC
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
|
||||
#include <sys/prctl.h>
|
||||
#include <linux/prctl.h>
|
||||
|
||||
/* Get/set the process' ability to use the timestamp counter instruction */
|
||||
#ifndef PR_GET_TSC
|
||||
#define PR_GET_TSC 25
|
||||
#define PR_SET_TSC 26
|
||||
# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
|
||||
# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
|
||||
#endif
|
||||
|
||||
const char *tsc_names[] =
|
||||
{
|
||||
[0] = "[not set]",
|
||||
[PR_TSC_ENABLE] = "PR_TSC_ENABLE",
|
||||
[PR_TSC_SIGSEGV] = "PR_TSC_SIGSEGV",
|
||||
};
|
||||
|
||||
static uint64_t rdtsc(void)
|
||||
{
|
||||
uint32_t lo, hi;
|
||||
/* We cannot use "=A", since this would use %rax on x86_64 */
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return (uint64_t)hi << 32 | lo;
|
||||
}
|
||||
|
||||
static void sigsegv_cb(int sig)
|
||||
{
|
||||
int tsc_val = 0;
|
||||
|
||||
printf("[ SIG_SEGV ]\n");
|
||||
printf("prctl(PR_GET_TSC, &tsc_val); ");
|
||||
fflush(stdout);
|
||||
|
||||
if ( prctl(PR_GET_TSC, &tsc_val) == -1)
|
||||
perror("prctl");
|
||||
|
||||
printf("tsc_val == %s\n", tsc_names[tsc_val]);
|
||||
printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
|
||||
fflush(stdout);
|
||||
if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
|
||||
perror("prctl");
|
||||
|
||||
printf("rdtsc() == ");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int tsc_val = 0;
|
||||
|
||||
signal(SIGSEGV, sigsegv_cb);
|
||||
|
||||
printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
|
||||
printf("prctl(PR_GET_TSC, &tsc_val); ");
|
||||
fflush(stdout);
|
||||
|
||||
if ( prctl(PR_GET_TSC, &tsc_val) == -1)
|
||||
perror("prctl");
|
||||
|
||||
printf("tsc_val == %s\n", tsc_names[tsc_val]);
|
||||
printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
|
||||
printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
|
||||
fflush(stdout);
|
||||
|
||||
if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
|
||||
perror("prctl");
|
||||
|
||||
printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
|
||||
printf("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)\n");
|
||||
fflush(stdout);
|
||||
|
||||
if ( prctl(PR_SET_TSC, PR_TSC_SIGSEGV) == -1)
|
||||
perror("prctl");
|
||||
|
||||
printf("rdtsc() == ");
|
||||
fflush(stdout);
|
||||
printf("%llu\n", (unsigned long long)rdtsc());
|
||||
fflush(stdout);
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
57
Documentation/prctl/no_new_privs.txt
Normal file
57
Documentation/prctl/no_new_privs.txt
Normal file
|
@ -0,0 +1,57 @@
|
|||
The execve system call can grant a newly-started program privileges that
|
||||
its parent did not have. The most obvious examples are setuid/setgid
|
||||
programs and file capabilities. To prevent the parent program from
|
||||
gaining these privileges as well, the kernel and user code must be
|
||||
careful to prevent the parent from doing anything that could subvert the
|
||||
child. For example:
|
||||
|
||||
- The dynamic loader handles LD_* environment variables differently if
|
||||
a program is setuid.
|
||||
|
||||
- chroot is disallowed to unprivileged processes, since it would allow
|
||||
/etc/passwd to be replaced from the point of view of a process that
|
||||
inherited chroot.
|
||||
|
||||
- The exec code has special handling for ptrace.
|
||||
|
||||
These are all ad-hoc fixes. The no_new_privs bit (since Linux 3.5) is a
|
||||
new, generic mechanism to make it safe for a process to modify its
|
||||
execution environment in a manner that persists across execve. Any task
|
||||
can set no_new_privs. Once the bit is set, it is inherited across fork,
|
||||
clone, and execve and cannot be unset. With no_new_privs set, execve
|
||||
promises not to grant the privilege to do anything that could not have
|
||||
been done without the execve call. For example, the setuid and setgid
|
||||
bits will no longer change the uid or gid; file capabilities will not
|
||||
add to the permitted set, and LSMs will not relax constraints after
|
||||
execve.
|
||||
|
||||
To set no_new_privs, use prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0).
|
||||
|
||||
Be careful, though: LSMs might also not tighten constraints on exec
|
||||
in no_new_privs mode. (This means that setting up a general-purpose
|
||||
service launcher to set no_new_privs before execing daemons may
|
||||
interfere with LSM-based sandboxing.)
|
||||
|
||||
Note that no_new_privs does not prevent privilege changes that do not
|
||||
involve execve. An appropriately privileged task can still call
|
||||
setuid(2) and receive SCM_RIGHTS datagrams.
|
||||
|
||||
There are two main use cases for no_new_privs so far:
|
||||
|
||||
- Filters installed for the seccomp mode 2 sandbox persist across
|
||||
execve and can change the behavior of newly-executed programs.
|
||||
Unprivileged users are therefore only allowed to install such filters
|
||||
if no_new_privs is set.
|
||||
|
||||
- By itself, no_new_privs can be used to reduce the attack surface
|
||||
available to an unprivileged user. If everything running with a
|
||||
given uid has no_new_privs set, then that uid will be unable to
|
||||
escalate its privileges by directly attacking setuid, setgid, and
|
||||
fcap-using binaries; it will need to compromise something without the
|
||||
no_new_privs bit set first.
|
||||
|
||||
In the future, other potentially dangerous kernel features could become
|
||||
available to unprivileged tasks if no_new_privs is set. In principle,
|
||||
several options to unshare(2) and clone(2) would be safe when
|
||||
no_new_privs is set, and no_new_privs + chroot is considerable less
|
||||
dangerous than chroot by itself.
|
225
Documentation/prctl/seccomp_filter.txt
Normal file
225
Documentation/prctl/seccomp_filter.txt
Normal file
|
@ -0,0 +1,225 @@
|
|||
SECure COMPuting with filters
|
||||
=============================
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
A large number of system calls are exposed to every userland process
|
||||
with many of them going unused for the entire lifetime of the process.
|
||||
As system calls change and mature, bugs are found and eradicated. A
|
||||
certain subset of userland applications benefit by having a reduced set
|
||||
of available system calls. The resulting set reduces the total kernel
|
||||
surface exposed to the application. System call filtering is meant for
|
||||
use with those applications.
|
||||
|
||||
Seccomp filtering provides a means for a process to specify a filter for
|
||||
incoming system calls. The filter is expressed as a Berkeley Packet
|
||||
Filter (BPF) program, as with socket filters, except that the data
|
||||
operated on is related to the system call being made: system call
|
||||
number and the system call arguments. This allows for expressive
|
||||
filtering of system calls using a filter program language with a long
|
||||
history of being exposed to userland and a straightforward data set.
|
||||
|
||||
Additionally, BPF makes it impossible for users of seccomp to fall prey
|
||||
to time-of-check-time-of-use (TOCTOU) attacks that are common in system
|
||||
call interposition frameworks. BPF programs may not dereference
|
||||
pointers which constrains all filters to solely evaluating the system
|
||||
call arguments directly.
|
||||
|
||||
What it isn't
|
||||
-------------
|
||||
|
||||
System call filtering isn't a sandbox. It provides a clearly defined
|
||||
mechanism for minimizing the exposed kernel surface. It is meant to be
|
||||
a tool for sandbox developers to use. Beyond that, policy for logical
|
||||
behavior and information flow should be managed with a combination of
|
||||
other system hardening techniques and, potentially, an LSM of your
|
||||
choosing. Expressive, dynamic filters provide further options down this
|
||||
path (avoiding pathological sizes or selecting which of the multiplexed
|
||||
system calls in socketcall() is allowed, for instance) which could be
|
||||
construed, incorrectly, as a more complete sandboxing solution.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
An additional seccomp mode is added and is enabled using the same
|
||||
prctl(2) call as the strict seccomp. If the architecture has
|
||||
CONFIG_HAVE_ARCH_SECCOMP_FILTER, then filters may be added as below:
|
||||
|
||||
PR_SET_SECCOMP:
|
||||
Now takes an additional argument which specifies a new filter
|
||||
using a BPF program.
|
||||
The BPF program will be executed over struct seccomp_data
|
||||
reflecting the system call number, arguments, and other
|
||||
metadata. The BPF program must then return one of the
|
||||
acceptable values to inform the kernel which action should be
|
||||
taken.
|
||||
|
||||
Usage:
|
||||
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog);
|
||||
|
||||
The 'prog' argument is a pointer to a struct sock_fprog which
|
||||
will contain the filter program. If the program is invalid, the
|
||||
call will return -1 and set errno to EINVAL.
|
||||
|
||||
If fork/clone and execve are allowed by @prog, any child
|
||||
processes will be constrained to the same filters and system
|
||||
call ABI as the parent.
|
||||
|
||||
Prior to use, the task must call prctl(PR_SET_NO_NEW_PRIVS, 1) or
|
||||
run with CAP_SYS_ADMIN privileges in its namespace. If these are not
|
||||
true, -EACCES will be returned. This requirement ensures that filter
|
||||
programs cannot be applied to child processes with greater privileges
|
||||
than the task that installed them.
|
||||
|
||||
Additionally, if prctl(2) is allowed by the attached filter,
|
||||
additional filters may be layered on which will increase evaluation
|
||||
time, but allow for further decreasing the attack surface during
|
||||
execution of a process.
|
||||
|
||||
The above call returns 0 on success and non-zero on error.
|
||||
|
||||
Return values
|
||||
-------------
|
||||
A seccomp filter may return any of the following values. If multiple
|
||||
filters exist, the return value for the evaluation of a given system
|
||||
call will always use the highest precedent value. (For example,
|
||||
SECCOMP_RET_KILL will always take precedence.)
|
||||
|
||||
In precedence order, they are:
|
||||
|
||||
SECCOMP_RET_KILL:
|
||||
Results in the task exiting immediately without executing the
|
||||
system call. The exit status of the task (status & 0x7f) will
|
||||
be SIGSYS, not SIGKILL.
|
||||
|
||||
SECCOMP_RET_TRAP:
|
||||
Results in the kernel sending a SIGSYS signal to the triggering
|
||||
task without executing the system call. siginfo->si_call_addr
|
||||
will show the address of the system call instruction, and
|
||||
siginfo->si_syscall and siginfo->si_arch will indicate which
|
||||
syscall was attempted. The program counter will be as though
|
||||
the syscall happened (i.e. it will not point to the syscall
|
||||
instruction). The return value register will contain an arch-
|
||||
dependent value -- if resuming execution, set it to something
|
||||
sensible. (The architecture dependency is because replacing
|
||||
it with -ENOSYS could overwrite some useful information.)
|
||||
|
||||
The SECCOMP_RET_DATA portion of the return value will be passed
|
||||
as si_errno.
|
||||
|
||||
SIGSYS triggered by seccomp will have a si_code of SYS_SECCOMP.
|
||||
|
||||
SECCOMP_RET_ERRNO:
|
||||
Results in the lower 16-bits of the return value being passed
|
||||
to userland as the errno without executing the system call.
|
||||
|
||||
SECCOMP_RET_TRACE:
|
||||
When returned, this value will cause the kernel to attempt to
|
||||
notify a ptrace()-based tracer prior to executing the system
|
||||
call. If there is no tracer present, -ENOSYS is returned to
|
||||
userland and the system call is not executed.
|
||||
|
||||
A tracer will be notified if it requests PTRACE_O_TRACESECCOMP
|
||||
using ptrace(PTRACE_SETOPTIONS). The tracer will be notified
|
||||
of a PTRACE_EVENT_SECCOMP and the SECCOMP_RET_DATA portion of
|
||||
the BPF program return value will be available to the tracer
|
||||
via PTRACE_GETEVENTMSG.
|
||||
|
||||
The tracer can skip the system call by changing the syscall number
|
||||
to -1. Alternatively, the tracer can change the system call
|
||||
requested by changing the system call to a valid syscall number. If
|
||||
the tracer asks to skip the system call, then the system call will
|
||||
appear to return the value that the tracer puts in the return value
|
||||
register.
|
||||
|
||||
The seccomp check will not be run again after the tracer is
|
||||
notified. (This means that seccomp-based sandboxes MUST NOT
|
||||
allow use of ptrace, even of other sandboxed processes, without
|
||||
extreme care; ptracers can use this mechanism to escape.)
|
||||
|
||||
SECCOMP_RET_ALLOW:
|
||||
Results in the system call being executed.
|
||||
|
||||
If multiple filters exist, the return value for the evaluation of a
|
||||
given system call will always use the highest precedent value.
|
||||
|
||||
Precedence is only determined using the SECCOMP_RET_ACTION mask. When
|
||||
multiple filters return values of the same precedence, only the
|
||||
SECCOMP_RET_DATA from the most recently installed filter will be
|
||||
returned.
|
||||
|
||||
Pitfalls
|
||||
--------
|
||||
|
||||
The biggest pitfall to avoid during use is filtering on system call
|
||||
number without checking the architecture value. Why? On any
|
||||
architecture that supports multiple system call invocation conventions,
|
||||
the system call numbers may vary based on the specific invocation. If
|
||||
the numbers in the different calling conventions overlap, then checks in
|
||||
the filters may be abused. Always check the arch value!
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
The samples/seccomp/ directory contains both an x86-specific example
|
||||
and a more generic example of a higher level macro interface for BPF
|
||||
program generation.
|
||||
|
||||
|
||||
|
||||
Adding architecture support
|
||||
-----------------------
|
||||
|
||||
See arch/Kconfig for the authoritative requirements. In general, if an
|
||||
architecture supports both ptrace_event and seccomp, it will be able to
|
||||
support seccomp filter with minor fixup: SIGSYS support and seccomp return
|
||||
value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
|
||||
to its arch-specific Kconfig.
|
||||
|
||||
|
||||
|
||||
Caveats
|
||||
-------
|
||||
|
||||
The vDSO can cause some system calls to run entirely in userspace,
|
||||
leading to surprises when you run programs on different machines that
|
||||
fall back to real syscalls. To minimize these surprises on x86, make
|
||||
sure you test with
|
||||
/sys/devices/system/clocksource/clocksource0/current_clocksource set to
|
||||
something like acpi_pm.
|
||||
|
||||
On x86-64, vsyscall emulation is enabled by default. (vsyscalls are
|
||||
legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities:
|
||||
|
||||
- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
|
||||
the vsyscall entry for the given call and not the address after the
|
||||
'syscall' instruction. Any code which wants to restart the call
|
||||
should be aware that (a) a ret instruction has been emulated and (b)
|
||||
trying to resume the syscall will again trigger the standard vsyscall
|
||||
emulation security checks, making resuming the syscall mostly
|
||||
pointless.
|
||||
|
||||
- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
|
||||
but the syscall may not be changed to another system call using the
|
||||
orig_rax register. It may only be changed to -1 order to skip the
|
||||
currently emulated call. Any other change MAY terminate the process.
|
||||
The rip value seen by the tracer will be the syscall entry address;
|
||||
this is different from normal behavior. The tracer MUST NOT modify
|
||||
rip or rsp. (Do not rely on other changes terminating the process.
|
||||
They might work. For example, on some kernels, choosing a syscall
|
||||
that only exists in future kernels will be correctly emulated (by
|
||||
returning -ENOSYS).
|
||||
|
||||
To detect this quirky behavior, check for addr & ~0x0C00 ==
|
||||
0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For
|
||||
SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other
|
||||
condition: future kernels may improve vsyscall emulation and current
|
||||
kernels in vsyscall=native mode will behave differently, but the
|
||||
instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
|
||||
cases.
|
||||
|
||||
Note that modern systems are unlikely to use vsyscalls at all -- they
|
||||
are a legacy feature and they are considerably slower than standard
|
||||
syscalls. New code will use the vDSO, and vDSO-issued system calls
|
||||
are indistinguishable from normal system calls.
|
Loading…
Add table
Add a link
Reference in a new issue