niXforums Forum Index
 FAQFAQ   SearchSearch   MemberlistMemberlist   UsergroupsUsergroups   RegisterRegister 
 ProfileProfile   PreferencesPreferences   Log in to check your private messagesLog in to check your private messages   Log inLog in 
·  nixdoc.net ·  man pages ·  Linux HOWTOs ·  FreeBSD Tips ·  Forums
navigation Forum index » *nix » BSD » FreeBSD » mail-lists » Architecture
wiring the sysctl output buffer
Post new topic   Reply to topic Page 1 of 1 [13 Posts] View previous topic :: View next topic
Author Message
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Sat Jul 20, 2002 9:39 pm    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 17 Jul, Don Lewis wrote:

Quote:
My initial solution was to allocate a temporary buffer, but that
complicates the code and the similar code might have to be added to
other parts of the kernel where similar situations might arise. It was
not until later that I thought of pre-wiring the buffer, which looks
like a much cleaner solution. In the latter case, I decided that it
would be best to abstact the interface so that the handler doesn't need
to become intimate with the internals of the sysctl system. If this
solution allows us to avoid the call to vslock() in the majority of
cases, then so much the better.

The initial implementation of the interface to vslock() would probably
just ignore the size argument, but defining the API this way would allow
it to be used later if necessary without the necessity of retrofitting
the handlers that use it.

Attached is another iteration of the patch. I added a length parameter
to the vslock() wrapper, though this parameter is currently ignored. I
modified the standard handlers to either make a temporary copy of the
data or to wire the buffer as appropriate. I also added an assertion to
sysctl_old_user() to catch calls to SYSCTL_OUT() that may block while
locks are held. This assertion has already turned up some other calls
to SYSCTL_OUT() where locks are held.

This patch also includes the sysctl_kern_function_list() fix.

What's the scoop with CTLFLAG_NOLOCK? Should it be used to retrieve
static data? If so, should the default handlers skip the extra copy if
this flag is set?


Index: kern_sysctl.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.127
diff -u -r1.127 kern_sysctl.c
--- kern_sysctl.c 15 Jul 2002 17:28:34 -0000 1.127
+++ kern_sysctl.c 20 Jul 2002 21:10:42 -0000
@@ -57,6 +57,7 @@

static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
+static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");

/*
* Locking - this locks the sysctl tree in memory.
@@ -751,12 +752,16 @@
int
sysctl_handle_int(SYSCTL_HANDLER_ARGS)
{
- int error = 0;
+ int tmpout, error = 0;

+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
if (arg1)
- error = SYSCTL_OUT(req, arg1, sizeof(int));
+ tmpout = *(int *)arg1;
else
- error = SYSCTL_OUT(req, &arg2, sizeof(int));
+ tmpout = arg2;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(int));

if (error || !req->newptr)
return (error);
@@ -776,10 +781,15 @@
sysctl_handle_long(SYSCTL_HANDLER_ARGS)
{
int error = 0;
+ long tmpout;

+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
if (!arg1)
return (EINVAL);
- error = SYSCTL_OUT(req, arg1, sizeof(long));
+ tmpout = *(long *)arg1;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(long));

if (error || !req->newptr)
return (error);
@@ -799,8 +809,23 @@
sysctl_handle_string(SYSCTL_HANDLER_ARGS)
{
int error=0;
+ char *tmparg;
+ size_t outlen;

- error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1);
+ /*
+ * Attempt to get a coherent snapshot by copying to a
+ * temporary kernel buffer.
+ */
+retry:
+ outlen = strlen((char *)arg1)+1;
+ tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
+ strncpy(tmparg, (char *)arg1, outlen);
+ if (tmparg[outlen-1] != '\0') {
+ free(tmparg, M_SYSCTLTMP);
+ goto retry;
+ }
+ error = SYSCTL_OUT(req, tmparg, outlen);
+ free(tmparg, M_SYSCTLTMP);

if (error || !req->newptr)
return (error);
@@ -825,8 +850,23 @@
sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
{
int error;
+ void *tmparg;

- error = SYSCTL_OUT(req, arg1, arg2);
+ /*
+ * Attempt to get a coherent snapshot, either by wiring the
+ * user space buffer or copying to a temporary kernel buffer
+ * depending on the size of the data.
+ */
+ if (arg2 > PAGE_SIZE) {
+ sysctl_wire_old_buffer(req, arg2);
+ error = SYSCTL_OUT(req, arg1, arg2);
+ } else {
+ tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK);
+ strcpy(tmparg, (char *)arg1);
+ bcopy(arg1, tmparg, arg2);
+ error = SYSCTL_OUT(req, tmparg, arg2);
+ free(tmparg, M_SYSCTLTMP);
+ }

if (error || !req->newptr)
return (error);
@@ -953,10 +993,8 @@
int error = 0;
size_t i = 0;

- if (req->lock == 1 && req->oldptr) {
- vslock(req->oldptr, req->oldlen);
- req->lock = 2;
- }
+ if (req->lock == 1 && req->oldptr)
+ WITNESS_SLEEP(1, NULL);
if (req->oldptr) {
i = l;
if (req->oldlen <= req->oldidx)
@@ -988,6 +1026,23 @@
error = copyin((char *)req->newptr + req->newidx, p, l);
req->newidx += l;
return (error);
+}
+
+/*
+ * Wire the user space destination buffer. If set to a value greater than
+ * zero, the len parameter limits the maximum amount of wired memory.
+ *
+ * XXX - The len parameter is currently ignored due to the lack of
+ * a place to save it in the sysctl_req structure so that the matching
+ * amount of memory can be unwired in the sysctl exit code.
+ */
+void
+sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
+{
+ if (req->lock == 1 && req->oldptr && req->oldfunc == sysctl_old_user) {
+ vslock(req->oldptr, req->oldlen);
+ req->lock = 2;
+ }
}

int
Index: kern_linker.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_linker.c,v
retrieving revision 1.91
diff -u -r1.91 kern_linker.c
--- kern_linker.c 7 Jul 2002 22:35:47 -0000 1.91
+++ kern_linker.c 17 Jul 2002 10:37:50 -0000
@@ -1794,6 +1794,7 @@
linker_file_t lf;
int error;

+ sysctl_wire_old_buffer(req, 0);
mtx_lock(&kld_mtx);
TAILQ_FOREACH(lf, &linker_files, link) {
error = LINKER_EACH_FUNCTION_NAME(lf,







To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Wed Jul 17, 2002 11:30 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 17 Jul, Don Lewis wrote:
Quote:
On 17 Jul, Bruce Evans wrote:
I think no time should be spent changing interfaces for this. The
following hack might work well: use the current code in sysctl_old_user()
if req->oldlen is not huge. Otherwise, vslock() only the region being
copied out to and vsunlock() it after the copy so that we don't have to
keep track of what is vslock()ed.

That doesn't fix sysctl_kern_function_list(), which grabs a lock and
then walks a linked list, calling SYSCTL_OUT() on each element.

Two other places that we have the similar problems are tcp_pcblist() and
udp_pcblist(). This code appears to have other problems as well, such
as the possibility of looking at stale data on the free list ...


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Wed Jul 17, 2002 7:55 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 17 Jul, Bruce Evans wrote:
Quote:
On Tue, 16 Jul 2002, Don Lewis wrote:

On 14 Jul, Bruce Evans wrote:
...
`oldlen' is known ahead of time, so a large enough buffer could be
allocated in most cases. Perhaps vslock() iff oldlen is very large.
The data could still change while it is being copied to a malloc()'ed
buffer (or earlier) if the kernel is preemptible.

I basically like this approach. In most cases the data will be a small
and of fixed size, so the handler could even store the temporary copy on
the stack. If the data is a type that can be copied atomically, like an
int, we don't have to worry about the data changing in mid-copy. If it

It's not clear that ints can be copied atomically unless a suitable lock
is held. Longs can't be on some arches (i386's with 64-bit longs for
example Smile.

I believe that ints can be copied atomically by assignment if they are
naturally aligned and nobody is doing byte writes to them. It's would
be likely that longs could be copied atomically as long as neither the
the reader nor the writer of the variable in question were preempted by
interrupts in the middle of the two halves of their operations. I
wouldn't want to make any guarantees without a lot of careful study.
It's not likely that you could get an atomic snapshot with bcopy() or
copyout() without doing explicit locking.

Quote:
is larger and we care about getting an atomic snapshot, the handler can
lock the source for the duration of the copy to the temporary buffer.

For large amounts of data, we probably want to avoid copying to a
temporary buffer. PHK raised the issue of the user specifying a buffer
size that is way too large. Allocating a huge buffer in wired down
kernel memory sounds even worse than wiring down a huge buffer in user
space.

I think this is a small problem provided you only make the buffer large
enough to hold what the kernel is supplying in the current SYSCTL_OUT()
and not what the application is willing to accept for the whole sysctl.
There may be no suppliers of huge kernel data except KERN_VNODE, and
KERN_VNODE was turned off because it was too hard to supply that much
data correctly even years ago when there were fewer vnodes and less
concurrency than now. Any suppliers of huge kernel data may have the
same problems.

As long as we know the amount of data ahead of time ...

Quote:
If we don't want to allocate a temporary buffer of length oldlen because
it is much larger than what looks to be reasonable, but the amount of

OK, but allocating if it is <= PAGE_SIZE or small enough to put on the
stack should handle most cases and hopefully not signficantly complicate
the problem of keeping track of what you allocated.

That sounds about right since that's probably about the crossover point
between the expense of the extra copy versus the call to vslock().

Quote:
data is not easy to calculate ahead of time, we have another potential
problem. We might have to lock a data structure, walk through this
data structure and calculate the total data size, and unlock the data
structure. After we malloc() the buffer, relock the data structure, and
start traversing the data structure again to do the copy, we may
discover that the amount of data to be returned has increased, so we
would have to drop the lock, free the buffer, and start over again.

Also, the size might turn out to be too huge to allocate. You would then
need to backtrack and recalulate the size using a less ambitious algorithm.

I think the callers of SYSCTL_OUT() don't need a new API, but they
should supply the data in a form suitable for copying out directly.
This probably involves them making a copy while holding suitable
domain-specific locks. They can't just point to an active kernel
struct since it might change.

I would just make a copy at a high level for now.

I'd vote for a hybrid approach. I would remove the vslock() call from
sysctl_old_user() and provide the new API for invoking it from the
handler if appropriate. Instead, I would call WITNESS_SLEEP() if
vslock() had not been called to wire the buffer to guard against
blocking while locks are held. I would keep the call to vsunlock() in
the sysctl cleanup code.

It's never really appropriate, but I guess you need it for a few sysctls
because they are too large to fix all at once. I hope the new API won't
be used much. Sysctls with fixed locking can keep using the current code
since they will have copied the data to a safe place and not hold any
locks when they call SYSCTL_OUT() or care if SYSCTL_OUT() makes another
copy or blocks.

I suspect that there are very few places where this would be used. A
very large percentage of the sysctl calls use the standard handlers and
small data types whose size is known ahead of time. All of these can be
modified to make a temporary copy of the data and can avoid the vslock()
overhead.

Quote:
I'd also like to provide a second argument to the interface to vslock()
to allow the handler to specify a maximum, kernel enforced, buffer
length to potentially limit the amount of wired memory to something less
than oldlen, but unless we add a member to the sysctl_req structure to
hold it, we would need to overwrite oldlen so that we can pass the
proper value to vsunlock() in the cleanup code. What kind impact would
changing sysctl_req have on binary compatibility? I doesn't look like a
problem to me, since this structure seems to be mostly treated as an
opaque type.

Changing its size would probably not be good for some modules.

I think no time should be spent changing interfaces for this. The
following hack might work well: use the current code in sysctl_old_user()
if req->oldlen is not huge. Otherwise, vslock() only the region being
copied out to and vsunlock() it after the copy so that we don't have to
keep track of what is vslock()ed.

That doesn't fix sysctl_kern_function_list(), which grabs a lock and
then walks a linked list, calling SYSCTL_OUT() on each element. In this
case we can't allow SYSCTL_OUT() to block in either vslock() or
copyout(). The incremental vslock() solution also doesn't really solve
any problems if the calls to vslock() and vsunlock() are done in
sysctl_old_user(). They neither guarantee us a coherent copy if the
source buffer is dynamic and unlocked (though they might increase the
probability), and they also prevent SYSCTL_OUT() being called while a
lock is held.

My initial solution was to allocate a temporary buffer, but that
complicates the code and the similar code might have to be added to
other parts of the kernel where similar situations might arise. It was
not until later that I thought of pre-wiring the buffer, which looks
like a much cleaner solution. In the latter case, I decided that it
would be best to abstact the interface so that the handler doesn't need
to become intimate with the internals of the sysctl system. If this
solution allows us to avoid the call to vslock() in the majority of
cases, then so much the better.

The initial implementation of the interface to vslock() would probably
just ignore the size argument, but defining the API this way would allow
it to be used later if necessary without the necessity of retrofitting
the handlers that use it.



To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Bruce Evans
*nix forums Guru Wannabe


Joined: 22 Mar 2002
Posts: 190

PostPosted: Wed Jul 17, 2002 5:20 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On Tue, 16 Jul 2002, Don Lewis wrote:

Quote:
On 14 Jul, Bruce Evans wrote:
...
`oldlen' is known ahead of time, so a large enough buffer could be
allocated in most cases. Perhaps vslock() iff oldlen is very large.
The data could still change while it is being copied to a malloc()'ed
buffer (or earlier) if the kernel is preemptible.

I basically like this approach. In most cases the data will be a small
and of fixed size, so the handler could even store the temporary copy on
the stack. If the data is a type that can be copied atomically, like an
int, we don't have to worry about the data changing in mid-copy. If it

It's not clear that ints can be copied atomically unless a suitable lock
is held. Longs can't be on some arches (i386's with 64-bit longs for
example Smile.

Quote:
is larger and we care about getting an atomic snapshot, the handler can
lock the source for the duration of the copy to the temporary buffer.

For large amounts of data, we probably want to avoid copying to a
temporary buffer. PHK raised the issue of the user specifying a buffer
size that is way too large. Allocating a huge buffer in wired down
kernel memory sounds even worse than wiring down a huge buffer in user
space.

I think this is a small problem provided you only make the buffer large
enough to hold what the kernel is supplying in the current SYSCTL_OUT()
and not what the application is willing to accept for the whole sysctl.
There may be no suppliers of huge kernel data except KERN_VNODE, and
KERN_VNODE was turned off because it was too hard to supply that much
data correctly even years ago when there were fewer vnodes and less
concurrency than now. Any suppliers of huge kernel data may have the
same problems.

Quote:
If we don't want to allocate a temporary buffer of length oldlen because
it is much larger than what looks to be reasonable, but the amount of

OK, but allocating if it is <= PAGE_SIZE or small enough to put on the
stack should handle most cases and hopefully not signficantly complicate
the problem of keeping track of what you allocated.

Quote:
data is not easy to calculate ahead of time, we have another potential
problem. We might have to lock a data structure, walk through this
data structure and calculate the total data size, and unlock the data
structure. After we malloc() the buffer, relock the data structure, and
start traversing the data structure again to do the copy, we may
discover that the amount of data to be returned has increased, so we
would have to drop the lock, free the buffer, and start over again.

Also, the size might turn out to be too huge to allocate. You would then
need to backtrack and recalulate the size using a less ambitious algorithm.

Quote:
I think the callers of SYSCTL_OUT() don't need a new API, but they
should supply the data in a form suitable for copying out directly.
This probably involves them making a copy while holding suitable
domain-specific locks. They can't just point to an active kernel
struct since it might change.

I would just make a copy at a high level for now.

I'd vote for a hybrid approach. I would remove the vslock() call from
sysctl_old_user() and provide the new API for invoking it from the
handler if appropriate. Instead, I would call WITNESS_SLEEP() if
vslock() had not been called to wire the buffer to guard against
blocking while locks are held. I would keep the call to vsunlock() in
the sysctl cleanup code.

It's never really appropriate, but I guess you need it for a few sysctls
because they are too large to fix all at once. I hope the new API won't
be used much. Sysctls with fixed locking can keep using the current code
since they will have copied the data to a safe place and not hold any
locks when they call SYSCTL_OUT() or care if SYSCTL_OUT() makes another
copy or blocks.

Quote:
I'd also like to provide a second argument to the interface to vslock()
to allow the handler to specify a maximum, kernel enforced, buffer
length to potentially limit the amount of wired memory to something less
than oldlen, but unless we add a member to the sysctl_req structure to
hold it, we would need to overwrite oldlen so that we can pass the
proper value to vsunlock() in the cleanup code. What kind impact would
changing sysctl_req have on binary compatibility? I doesn't look like a
problem to me, since this structure seems to be mostly treated as an
opaque type.

Changing its size would probably not be good for some modules.

I think no time should be spent changing interfaces for this. The
following hack might work well: use the current code in sysctl_old_user()
if req->oldlen is not huge. Otherwise, vslock() only the region being
copied out to and vsunlock() it after the copy so that we don't have to
keep track of what is vslock()ed.

Bruce


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Tue Jul 16, 2002 9:54 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 14 Jul, Bruce Evans wrote:
Quote:
On Sun, 14 Jul 2002, Don Lewis wrote:

A number of places in the kernel call SYSCTL_OUT() while holding locks.
The problem is that SYSCTL_OUT() can block while it wires down the "old"
buffer in user space. If the data is small, such as an integer value,

This seems to have been broken by FreeBSD. In rev.1.1, vslock() was
called at the start of the sysctl, so the only problem with it was its
pessimalness (*). Now I think the vslock() is just a pessimization,
since the point of doing it was to avoid blocking during the copyout(),
but blocking during vslock() is equivalent. Also, vslock() is insufficent
if the kernel is preemptible.

Yes, in some ways it looks like we went backwards. We can solve the
problem of the data changing while it is being copied by locking the
source of the data, but we need to call vslock() before locking the
source of the data to avoid being blocked in vslock() while holding
locks. The current implematation which calls vslock() in the first
call to SYSCTL_OUT() is the reason that I'm looking at this stuff.

Quote:
the best solution may be to copy the data and defer calling SYSCTL_OUT()
until after the locks are released. This extra copy could be wasteful
if the data is large, and it may be cumbersome if the data size is not
known ahead of time, since determining the data size may be expensive.

The extra copy is unlikely to be more wasteful than vslock()'ing, since
vfslock() is a very expensive operation. Some instruction counts on
a UP i386 kernel with no INVARIANTS or WITNESS:

(*) vslock(): about 2700 instructions (for a length of just 16)
vsunlock(): about 1400
useracc(): about 400

`oldlen' is known ahead of time, so a large enough buffer could be
allocated in most cases. Perhaps vslock() iff oldlen is very large.
The data could still change while it is being copied to a malloc()'ed
buffer (or earlier) if the kernel is preemptible.

I basically like this approach. In most cases the data will be a small
and of fixed size, so the handler could even store the temporary copy on
the stack. If the data is a type that can be copied atomically, like an
int, we don't have to worry about the data changing in mid-copy. If it
is larger and we care about getting an atomic snapshot, the handler can
lock the source for the duration of the copy to the temporary buffer.

For large amounts of data, we probably want to avoid copying to a
temporary buffer. PHK raised the issue of the user specifying a buffer
size that is way too large. Allocating a huge buffer in wired down
kernel memory sounds even worse than wiring down a huge buffer in user
space.

If we don't want to allocate a temporary buffer of length oldlen because
it is much larger than what looks to be reasonable, but the amount of
data is not easy to calculate ahead of time, we have another potential
problem. We might have to lock a data structure, walk through this
data structure and calculate the total data size, and unlock the data
structure. After we malloc() the buffer, relock the data structure, and
start traversing the data structure again to do the copy, we may
discover that the amount of data to be returned has increased, so we
would have to drop the lock, free the buffer, and start over again.

Quote:
The data size could also potentially change between the time the size is
calculatated and the time when the data is copied to the temporary
buffer if the locks must be released so that MALLOC() can be called to
allocate the buffer.

Hopefully the sysctl locking is enough to prevent at least *req changing
underneath us.

I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

Here's a patch that implements this new API and illustrates how it can
be used to fix the kern.function_list sysctl, which wants to call
SYSCTL_OUT() multiple times while walking a locked data structure. I
think this is the best fix, though I'd like some feedback on whether
this is the best API.

I think the callers of SYSCTL_OUT() don't need a new API, but they
should supply the data in a form suitable for copying out directly.
This probably involves them making a copy while holding suitable
domain-specific locks. They can't just point to an active kernel
struct since it might change.

I would just make a copy at a high level for now.

I'd vote for a hybrid approach. I would remove the vslock() call from
sysctl_old_user() and provide the new API for invoking it from the
handler if appropriate. Instead, I would call WITNESS_SLEEP() if
vslock() had not been called to wire the buffer to guard against
blocking while locks are held. I would keep the call to vsunlock() in
the sysctl cleanup code.

If the handler makes a copy of the data and releases any locks before
calling SYSCTL_OUT(), then we can avoid the expense of vslock(). If
appropriate, the handler would still have the ability to wire the buffer
before grabbing a lock, and then invoke SYSCTL_OUT() while the lock is
held.

I'd also like to provide a second argument to the interface to vslock()
to allow the handler to specify a maximum, kernel enforced, buffer
length to potentially limit the amount of wired memory to something less
than oldlen, but unless we add a member to the sysctl_req structure to
hold it, we would need to overwrite oldlen so that we can pass the
proper value to vsunlock() in the cleanup code. What kind impact would
changing sysctl_req have on binary compatibility? I doesn't look like a
problem to me, since this structure seems to be mostly treated as an
opaque type.




To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Bruce Evans
*nix forums Guru Wannabe


Joined: 22 Mar 2002
Posts: 190

PostPosted: Sun Jul 14, 2002 11:35 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On Sun, 14 Jul 2002, Don Lewis wrote:

Quote:
A number of places in the kernel call SYSCTL_OUT() while holding locks.
The problem is that SYSCTL_OUT() can block while it wires down the "old"
buffer in user space. If the data is small, such as an integer value,

This seems to have been broken by FreeBSD. In rev.1.1, vslock() was
called at the start of the sysctl, so the only problem with it was its
pessimalness (*). Now I think the vslock() is just a pessimization,
since the point of doing it was to avoid blocking during the copyout(),
but blocking during vslock() is equivalent. Also, vslock() is insufficent
if the kernel is preemptible.

Quote:
the best solution may be to copy the data and defer calling SYSCTL_OUT()
until after the locks are released. This extra copy could be wasteful
if the data is large, and it may be cumbersome if the data size is not
known ahead of time, since determining the data size may be expensive.

The extra copy is unlikely to be more wasteful than vslock()'ing, since
vfslock() is a very expensive operation. Some instruction counts on
a UP i386 kernel with no INVARIANTS or WITNESS:

(*) vslock(): about 2700 instructions (for a length of just 16)
vsunlock(): about 1400
useracc(): about 400

`oldlen' is known ahead of time, so a large enough buffer could be
allocated in most cases. Perhaps vslock() iff oldlen is very large.
The data could still change while it is being copied to a malloc()'ed
buffer (or earlier) if the kernel is preemptible.

Quote:
The data size could also potentially change between the time the size is
calculatated and the time when the data is copied to the temporary
buffer if the locks must be released so that MALLOC() can be called to
allocate the buffer.

Hopefully the sysctl locking is enough to prevent at least *req changing
underneath us.

Quote:
I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

Here's a patch that implements this new API and illustrates how it can
be used to fix the kern.function_list sysctl, which wants to call
SYSCTL_OUT() multiple times while walking a locked data structure. I
think this is the best fix, though I'd like some feedback on whether
this is the best API.

I think the callers of SYSCTL_OUT() don't need a new API, but they
should supply the data in a form suitable for copying out directly.
This probably involves them making a copy while holding suitable
domain-specific locks. They can't just point to an active kernel
struct since it might change.

I would just make a copy at a high level for now.

Bruce


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Sun Jul 14, 2002 10:45 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 14 Jul, Poul-Henning Kamp wrote:
Quote:
In message <200207141020.g6EAK4wr020922@gw.catspoiler.org>, Don Lewis writes:
On 14 Jul, Poul-Henning Kamp wrote:

You may also want to restrict the amount of buffer you pin
if I hand the kernel a 2G buffer for a 4 byte sysctl integer
you wouldn't want to pin it all.

Yeah, that could be bad, but the SYSCTL_OUT() in the current API pins
the whole user supplied buffer on the first call. My new entry point
does the same thing, but allows the caller to do this potentially
blocking operation earlier.

I know, but that also affords an opportunity to be smarter (ie: letting
the entry point say "I need this much storage").

I like this idea, though we'd want to move the new interface to the
recommended list. There is also the problem keeping track of how much
memory we wired. The current sysctl exit code unwires the entire user
supplied buffer. If we only wire a portion, then we would need to add a
member to the sysctl_req structure to record how much was wired so that
we could unwire that amount on exit.



To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Sun Jul 14, 2002 10:20 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 14 Jul, Poul-Henning Kamp wrote:
Quote:
In message <200207140954.g6E9sBwr020599@gw.catspoiler.org>, Don Lewis writes:
On 14 Jul, Poul-Henning Kamp wrote:

It used to be that sysctl unconditionally wired the output buffer,
but that gave rise to a host of other problems.

Anything specific that I should be aware of?

No, just bad deadlocks and such.

You may also want to restrict the amount of buffer you pin
if I hand the kernel a 2G buffer for a 4 byte sysctl integer
you wouldn't want to pin it all.

Yeah, that could be bad, but the SYSCTL_OUT() in the current API pins
the whole user supplied buffer on the first call. My new entry point
does the same thing, but allows the caller to do this potentially
blocking operation earlier.



To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Sun Jul 14, 2002 9:54 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

On 14 Jul, Poul-Henning Kamp wrote:
Quote:
In message <200207140916.g6E9Ghwr020552@gw.catspoiler.org>, Don Lewis writes:

I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

It used to be that sysctl unconditionally wired the output buffer,
but that gave rise to a host of other problems.

Anything specific that I should be aware of?

Quote:
I'm not against the suggested change, but it should be used with
caution.

I'd only want to use this when it was the best solution, preferably
after checking that SYSCTL_OUT() would actually be called.



To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Poul-Henning Kamp
*nix forums Guru


Joined: 21 Mar 2002
Posts: 436

PostPosted: Sun Jul 14, 2002 9:16 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

In message <200207141020.g6EAK4wr020922@gw.catspoiler.org>, Don Lewis writes:
Quote:
On 14 Jul, Poul-Henning Kamp wrote:
In message <200207140954.g6E9sBwr020599@gw.catspoiler.org>, Don Lewis writes:
On 14 Jul, Poul-Henning Kamp wrote:

It used to be that sysctl unconditionally wired the output buffer,
but that gave rise to a host of other problems.

Anything specific that I should be aware of?

No, just bad deadlocks and such.

You may also want to restrict the amount of buffer you pin
if I hand the kernel a 2G buffer for a 4 byte sysctl integer
you wouldn't want to pin it all.

Yeah, that could be bad, but the SYSCTL_OUT() in the current API pins
the whole user supplied buffer on the first call. My new entry point
does the same thing, but allows the caller to do this potentially
blocking operation earlier.

I know, but that also affords an opportunity to be smarter (ie: letting
the entry point say "I need this much storage").

--
Poul-Henning Kamp | UNIX since Zilog Zeus 3.20
phk@FreeBSD.ORG | TCP/IP since RFC 956
FreeBSD committer | BSD since 4.3-tahoe
Never attribute to malice what can adequately be explained by incompetence.

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Poul-Henning Kamp
*nix forums Guru


Joined: 21 Mar 2002
Posts: 436

PostPosted: Sun Jul 14, 2002 9:16 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

In message <200207140954.g6E9sBwr020599@gw.catspoiler.org>, Don Lewis writes:
Quote:
On 14 Jul, Poul-Henning Kamp wrote:
In message <200207140916.g6E9Ghwr020552@gw.catspoiler.org>, Don Lewis writes:

I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

It used to be that sysctl unconditionally wired the output buffer,
but that gave rise to a host of other problems.

Anything specific that I should be aware of?

No, just bad deadlocks and such.

You may also want to restrict the amount of buffer you pin
if I hand the kernel a 2G buffer for a 4 byte sysctl integer
you wouldn't want to pin it all.

--
Poul-Henning Kamp | UNIX since Zilog Zeus 3.20
phk@FreeBSD.ORG | TCP/IP since RFC 956
FreeBSD committer | BSD since 4.3-tahoe
Never attribute to malice what can adequately be explained by incompetence.

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Poul-Henning Kamp
*nix forums Guru


Joined: 21 Mar 2002
Posts: 436

PostPosted: Sun Jul 14, 2002 9:16 am    Post subject: Re: wiring the sysctl output buffer Reply with quote

In message <200207140916.g6E9Ghwr020552@gw.catspoiler.org>, Don Lewis writes:

Quote:
I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

It used to be that sysctl unconditionally wired the output buffer,
but that gave rise to a host of other problems.

I'm not against the suggested change, but it should be used with
caution.

--
Poul-Henning Kamp | UNIX since Zilog Zeus 3.20
phk@FreeBSD.ORG | TCP/IP since RFC 956
FreeBSD committer | BSD since 4.3-tahoe
Never attribute to malice what can adequately be explained by incompetence.

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Don Lewis
*nix forums beginner


Joined: 10 Jul 2002
Posts: 43

PostPosted: Sun Jul 14, 2002 9:16 am    Post subject: wiring the sysctl output buffer Reply with quote

A number of places in the kernel call SYSCTL_OUT() while holding locks.
The problem is that SYSCTL_OUT() can block while it wires down the "old"
buffer in user space. If the data is small, such as an integer value,
the best solution may be to copy the data and defer calling SYSCTL_OUT()
until after the locks are released. This extra copy could be wasteful
if the data is large, and it may be cumbersome if the data size is not
known ahead of time, since determining the data size may be expensive.
The data size could also potentially change between the time the size is
calculatated and the time when the data is copied to the temporary
buffer if the locks must be released so that MALLOC() can be called to
allocate the buffer.

I think the best solution to this problem is to add a sysctl system API
to prewire the output buffer that can be called before grabbing the
locks. Doing so allows the existing code to operate properly with only
minimal changes.

Here's a patch that implements this new API and illustrates how it can
be used to fix the kern.function_list sysctl, which wants to call
SYSCTL_OUT() multiple times while walking a locked data structure. I
think this is the best fix, though I'd like some feedback on whether
this is the best API.

Index: sys/sysctl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/sysctl.h,v
retrieving revision 1.105
diff -u -r1.105 sysctl.h
--- sys/sysctl.h 16 May 2002 21:28:26 -0000 1.105
+++ sys/sysctl.h 14 Jul 2002 07:57:29 -0000
@@ -595,6 +595,7 @@
size_t *retval);
int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
int *nindx, struct sysctl_req *req);
+void sysctl_wire_old_buffer(struct sysctl_req *req);

#else /* !_KERNEL */
#include <sys/cdefs.h>
Index: kern/kern_sysctl.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.126
diff -u -r1.126 kern_sysctl.c
--- kern/kern_sysctl.c 29 Jun 2002 02:00:01 -0000 1.126
+++ kern/kern_sysctl.c 14 Jul 2002 07:57:50 -0000
@@ -990,6 +990,18 @@
return (error);
}

+/*
+ * Lock the user space destination buffer.
+ */
+void
+sysctl_wire_old_buffer(struct sysctl_req *req)
+{
+ if (req->lock == 1 && req->oldptr && req->oldfunc == sysctl_old_user) {
+ vslock(req->oldptr, req->oldlen);
+ req->lock = 2;
+ }
+}
+
int
sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
int *nindx, struct sysctl_req *req)
Index: kern/kern_linker.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_linker.c,v
retrieving revision 1.91
diff -u -r1.91 kern_linker.c
--- kern/kern_linker.c 7 Jul 2002 22:35:47 -0000 1.91
+++ kern/kern_linker.c 14 Jul 2002 07:58:38 -0000
@@ -1794,6 +1794,7 @@
linker_file_t lf;
int error;

+ sysctl_wire_old_buffer(req);
mtx_lock(&kld_mtx);
TAILQ_FOREACH(lf, &linker_files, link) {
error = LINKER_EACH_FUNCTION_NAME(lf,





To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Back to top
Google

Back to top
Display posts from previous:   
Post new topic   Reply to topic Page 1 of 1 [13 Posts] View previous topic :: View next topic
The time now is Thu Jan 08, 2009 6:14 am | All times are GMT
navigation Forum index » *nix » BSD » FreeBSD » mail-lists » Architecture
Jump to:  

Similar Topics
Topic Author Forum Replies Last Post
No new posts mailq output format bacilko1 Postfix 0 Tue Oct 03, 2006 12:34 pm
No new posts How do I render JPEG Data stored in char* buffer? On the Sparrow C++ 2 Thu Jul 20, 2006 8:44 pm
No new posts syslog Output Change Maja AIX 0 Thu Jul 20, 2006 8:41 pm
No new posts output number mm.omid@gmail.com C++ 1 Thu Jul 20, 2006 5:09 pm
No new posts capture the output ciccio@unical.it python 2 Wed Jul 19, 2006 3:48 pm

Online Loans | Credit Card | Babb Fest | Pacotes TurĂ­sticos | Bankruptcy
Copyright © 2004-2005 DeniX Solutions SRL
 
Other DeniX Solutions sites: Unix/Linux blog |  electronics forum |  medicine forum |  science forum | 
Privacy Policy


Powered by phpBB © 2001, 2005 phpBB Group
[ Time: 0.2957s ][ Queries: 20 (0.0757s) ][ GZIP on - Debug on ]