ENOMEM error with plenty of free memory (was Crash on perlbal 1.53, possible Foundry influence)

Brett G. Durrett brett at imvu.com
Tue Feb 20 21:52:51 UTC 2007


I finally got a useful strace from a dying perlbal 1.53.  It looks like 
a request to mmap2 for 130K of memory is failing.  The odd thing is the 
host has more than three gigs of memory available (MemFree), so I am not 
sure why it would fail to allocate the memory.  The strace below is from 
kernel 2.6.8-2-686, stock Debian kernel running Perlbal version 1.53.  I 
am also seeing these failures on Perlbal versions 1.42 and 1.52 running 
on a custom 2.6.16 kernel.  I have two machines in this pool that do not 
die like the six others, these are running a custom 2.6.19 kernel in 
64-bit and they are running Perlbal 1.41.

The failure pattern for these machines is usually a rotating failure 
where one will fail and then another will fail a few minutes later 
(rotation could be related to load balancing) and then there will be a 
quiet period of 15-20 minutes with no failures before the cycle begins 
again.   Restarting the Perlbal process does not seem to help but, 
possibly coincidence, a reboot of four of the six failing servers 
eliminated the failures in our most recent cycle, after 2 hours of 
problems.  The cycle will also stop on its own and seems to appear at 
random times, most commonly when we are not under heavy loads.

I am hoping to dig into this more when I get some free time... sharing 
with the list in case this rings a bell with anybody...


time(NULL)                              = 1171950537
gettimeofday({1171950537, 835082}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=507, u64=507}}}, 1000, 1000) = 1
time(NULL)                              = 1171950537
read(507, "<?xml version=\'1.0\'?>\n<methodCal"..., 1372) = 1372
epoll_ctl(5, EPOLL_CTL_MOD, 507, {EPOLLERR|EPOLLHUP|0x20, {u32=507, 
u64=507}}) = 0
time(NULL)                              = 1171950537
setsockopt(196, SOL_TCP, TCP_CORK, [1], 4) = 0
write(196, "POST /catalog/skudb/test3.php HT"..., 335) = 335
setsockopt(196, SOL_TCP, TCP_CORK, [0], 4) = 0
write(196, "<?xml version=\'1.0\'?>\n<methodCal"..., 1372) = 1372
time(NULL)                              = 1171950537
setsockopt(507, SOL_TCP, TCP_CORK, [1], 4) = 0
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 836400}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=867, u64=867}}}, 1000, 1000) = 1
time(NULL)                              = 1171950537
read(867, "2/maDGSiR\nQO0NyL0FBwKmtb2yrJ/0kT"..., 131072) = 1460
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 836689}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=1857, u64=1857}}}, 1000, 1000) = 1
read(1857, "HTTP/1.1 304 Not Modified\r\nDate:"..., 102400) = 338
write(1242, "HTTP/1.0 304 Not Modified\r\nDate:"..., 339) = 339
time(NULL)                              = 1171950537
setsockopt(1242, SOL_TCP, TCP_CORK, [0], 4) = 0
epoll_ctl(5, EPOLL_CTL_MOD, 1242, {EPOLLIN|EPOLLERR|EPOLLHUP|0x20, 
{u32=1242, u64=1242}}) = 0
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 838553}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=1268, u64=1268}}}, 1000, 1000) = 1
time(NULL)                              = 1171950537
read(1268, "POST /catalog/skudb/test3.php HT"..., 102400) = 361
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 839257}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=212, u64=212}}, 
{EPOLLERR|EPOLLHUP|0x807a000, {u32=134717464, u64=133629452197912}}}, 
1000, 1000) = 2
time(NULL)                              = 1171950537
read(212, "POST /catalog/skudb/test3.php HT"..., 102400) = 361
time(NULL)                              = 1171950537
*mmap2(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 
-1, 0) = -1 ENOMEM (Cannot allocate memory)*
brk(0)                                  = 0x3ff90000
brk(0x3ffd0000)                         = 0x3ffd0000
read(867, "Kx3SrfAHKJAk\nf2VUc7dddUEKLC3R9hs"..., 131072) = 1460
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 842823}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=1579, u64=1579}}}, 1000, 1000) = 1
read(1579, "HTTP/1.1 304 Not Modified\r\nDate:"..., 102400) = 338
write(1265, "HTTP/1.0 304 Not Modified\r\nDate:"..., 339) = 339
time(NULL)                              = 1171950537
epoll_ctl(5, EPOLL_CTL_DEL, 1579, {EPOLLIN|EPOLLERR|EPOLLHUP|0x20, 
{u32=1579, u64=1579}}) = 0
time(NULL)                              = 1171950537
setsockopt(1265, SOL_TCP, TCP_CORK, [0], 4) = 0
epoll_ctl(5, EPOLL_CTL_MOD, 1265, {EPOLLIN|EPOLLERR|EPOLLHUP|0x20, 
{u32=1265, u64=1265}}) = 0
close(1579)                             = 0
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 848627}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=225, u64=225}}}, 1000, 1000) = 1
read(225, "HTTP/1.1 200 OK\r\nDate: Tue, 20 F"..., 102400) = 2967
write(447, "HTTP/1.0 200 OK\r\nDate: Tue, 20 F"..., 450) = 450
write(447, "<?xml version=\"1.0\" encoding=\"UT"..., 2517) = 2517
time(NULL)                              = 1171950537
epoll_ctl(5, EPOLL_CTL_DEL, 225, {EPOLLIN|EPOLLERR|EPOLLHUP|0x20, 
{u32=225, u64=225}}) = 0
time(NULL)                              = 1171950537
setsockopt(447, SOL_TCP, TCP_CORK, [0], 4) = 0
epoll_ctl(5, EPOLL_CTL_MOD, 447, {EPOLLIN|EPOLLERR|EPOLLHUP|0x20, 
{u32=447, u64=447}}) = 0
close(225)                              = 0
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 850399}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=867, u64=867}}}, 1000, 1000) = 1
time(NULL)                              = 1171950537
read(867, "9NDuQvO9A2Cdq3T\nVWt5Wf5ils86u/9N"..., 131072) = 1460
time(NULL)                              = 1171950537
time(NULL)                              = 1171950537
gettimeofday({1171950537, 850847}, NULL) = 0
epoll_wait(5, {{EPOLLIN, {u32=867, u64=867}}}, 1000, 1000) = 1
time(NULL)                              = 1171950537
*mmap2(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 
-1, 0) = -1 ENOMEM (Cannot allocate memory)*
brk(0)                                  = 0x3ffd0000
brk(0x40010000)                         = 0x3ffd0000
*mmap2(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 
-1, 0) = -1 ENOMEM (Cannot allocate memory)*
*write(2, "Out of memory!\n", 15)        = 15*
write(8, "\270\365\322\10", 4)          = 4
write(8, "\20\334\244\10", 4)           = 4
write(8, " 8\266\10", 4)                = 4
select(10, [9], NULL, NULL, NULL)       = 1 (in [9])
read(9, "\270\365\322\10", 4)           = 4
read(9, "\20\334\244\10", 4)            = 4
read(9, " 8\266\10", 4)                 = 4
read(9, 0xbffff908, 4)                  = -1 EAGAIN (Resource 
temporarily unavailable)
close(1702)                             = 0
close(1743)                             = 0
close(1704)                             = 0
close(1664)                             = 0
close(1406)                             = 0



-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.danga.com/pipermail/perlbal/attachments/20070220/7309581e/attachment.htm


More information about the perlbal mailing list