Nginx worker segfault, NULL pool

Hi,

Running nginx on ARM I’m having it segfault on just about any request (those
known not to crash are /status/nginx and /status/php-fpm).
Attaching to it with GDB I get the following trace:

Program received signal SIGSEGV, Segmentation fault.
ngx_alloc_chain_link (pool=0x0) at src/core/ngx_buf.c:52
52 src/core/ngx_buf.c: No such file or directory.
in src/core/ngx_buf.c
(gdb) backtrace
#0 ngx_alloc_chain_link (pool=0x0) at src/core/ngx_buf.c:52
#1 0x00012290 in ngx_chain_writer (data=0x525b24, in=)
at src/core/ngx_output_chain.c:626
#2 0x0001202c in ngx_output_chain (ctx=0x525ae4, in=0x5260a4) at
src/core/ngx_output_chain.c:66
#3 0x0004a6d0 in ngx_http_upstream_send_request (r=0x524c18,
u=0x525a9c) at src/http/ngx_http_upstream.c:1394
#4 0x0004aeec in ngx_http_upstream_init_request (r=0x524c18) at
src/http/ngx_http_upstream.c:645
#5 ngx_http_upstream_init (r=0x524c18) at
src/http/ngx_http_upstream.c:446
#6 0x000427a4 in ngx_http_read_client_request_body (r=0x524c18,
post_handler=0x4ac80 <ngx_http_upstream_init>) at
src/http/ngx_http_request_body.c:59
#7 0x000612e0 in ngx_http_fastcgi_handler (r=0x524c18) at
src/http/modules/ngx_http_fastcgi_module.c:636
#8 0x00036d18 in ngx_http_core_content_phase (r=0x524c18, ph=0x54ce08)
at src/http/ngx_http_core_module.c:1396
#9 0x00032458 in ngx_http_core_run_phases (r=0x524c18) at
src/http/ngx_http_core_module.c:877
#10 0x00037848 in ngx_http_internal_redirect (r=0x524c18, uri=, args=) at src/http/ngx_http_core_module.c:2545
#11 0x0004dad0 in ngx_http_index_handler (r=0x524c18) at
src/http/modules/ngx_http_index_module.c:277
#12 0x00036d38 in ngx_http_core_content_phase (r=0x524c18, ph=0x54ce08)
at src/http/ngx_http_core_module.c:1403
#13 0x00032458 in ngx_http_core_run_phases (r=0x524c18) at
src/http/ngx_http_core_module.c:877
#14 0x0003bccc in ngx_http_process_request (r=0x524c18) at
src/http/ngx_http_request.c:1688
#15 0x0003c6e0 in ngx_http_process_request_line (rev=0x40a5b10c) at
src/http/ngx_http_request.c:932
#16 0x000397b8 in ngx_http_init_request (rev=0x40a5b10c) at
src/http/ngx_http_request.c:519
#17 0x0002bf70 in ngx_epoll_process_events (cycle=,
timer=, flags=) at
src/event/modules/ngx_epoll_module.c:679
#18 0x00023d0c in ngx_process_events_and_timers (cycle=0x51ec18) at
src/event/ngx_event.c:247
#19 0x0002a278 in ngx_worker_process_cycle (cycle=,
data=) at src/os/unix/ngx_process_cycle.c:806
#20 0x00028920 in ngx_spawn_process (cycle=0x51ec18, proc=0,
data=0x40096918, name=0x69d00 “worker process”, respawn=-3) at
src/os/unix/ngx_process.c:198
#21 0x0002a6f0 in ngx_start_worker_processes (cycle=0x51ec18, n=1,
type=-3) at src/os/unix/ngx_process_cycle.c:365
#22 0x0002acb0 in ngx_master_process_cycle (cycle=0x51ec18) at
src/os/unix/ngx_process_cycle.c:137
#23 0x0000eb64 in main (argc=, argv=) at
src/core/nginx.c:410

System is Gentoo on ARM (armv5tel), nginx -V (applied patch:
forward-ported
ipv6-geoip support patch as attached):

nginx version: nginx/1.2.1
TLS SNI support enabled
configure arguments: --prefix=/usr --conf-path=/etc/nginx/nginx.conf
--error-log-path=/var/log/nginx/error_log --pid-path=/var/run/nginx.pid
--lock-path=/var/lock/nginx.lock --with-cc-opt=-I/usr/include
--with-ld-opt=-L/usr/lib --http-log-path=/var/log/nginx/access_log
--http-client-body-temp-path=/var/tmp/nginx/client
--http-proxy-temp-path=/var/tmp/nginx/proxy
--http-fastcgi-temp-path=/var/tmp/nginx/fastcgi
--http-scgi-temp-path=/var/tmp/nginx/scgi
--http-uwsgi-temp-path=/var/tmp/nginx/uwsgi --with-file-aio
--with-aio_module --with-ipv6 --with-pcre --without-http_browser_module
--without-http_charset_module --without-http_empty_gif_module
--without-http_memcached_module --without-http_proxy_module
--without-http_referer_module --without-http_scgi_module
--without-http_split_clients_module --without-http_userid_module
--without-http_uwsgi_module --with-http_geoip_module
--with-http_stub_status_module --with-http_xslt_module
--with-http_realip_module
--add-module=/var/tmp/portage/www-servers/nginx-1.2.1/work/agentzh-headers-more-nginx-module-3580526
--without-http-cache --with-http_ssl_module --without-mail_imap_module
--without-mail_pop3_module --without-mail_smtp_module --user=nginx
--group=nginx

Having a look at the code it means that
ngx_http_upstream_t->output->filter_ctx->pool is NULL
but still being dereferenced…

I have seen equivalent crash behavior with nginx-1.2.0 (not analysed, and
the exact cause not checked with gdb and debug symbols) on the same host,
but have not seen crashes on an x86 system with 1.2.0.
Note, config might help trigger the issue, quoted below:

############ nginx.conf ###############
user nginx nginx;
daemon off;
worker_processes 1;
worker_cpu_affinity 0001;
worker_rlimit_nofile 65535;

error_log /var/log/nginx/error_log info;

events {
accept_mutex off;
worker_connections 10240;
use epoll;
}

http {
include /etc/nginx/mime.types;
#default_type application/octet-stream;
server_names_hash_bucket_size 64;

    geoip_country  /usr/share/GeoIP/GeoIPv6.dat;

    log_format main
            '$remote_addr $host $remote_user [$time_local] '
            '"$request" $status $bytes_sent '
            '"$http_referer" "$http_user_agent" $request_time "$gzip_ratio" -';
    log_format main_ssl
            '$remote_addr $host $remote_user [$time_local] '
            '"$request" $status $bytes_sent '
            '"$http_referer" "$http_user_agent" $request_time "$gzip_ratio" $ssl_protocol';

    client_header_timeout 10m;
    client_body_timeout 10m;
    send_timeout 10m;

    connection_pool_size 256;
    client_header_buffer_size 1k;
    large_client_header_buffers 4 2k;
    request_pool_size 4k;

    gzip on;
    gzip_min_length 1100;
    gzip_buffers 4 8k;
    gzip_types text/plain application/xhtml+xml text/css application/javascript application/xml application/json;

    output_buffers 1 32k;
    postpone_output 1460;

    sendfile off;
    tcp_nopush on;
    tcp_nodelay on;

    keepalive_timeout 75 20;

    ignore_invalid_headers on;

    index index.html;

    # HTTP catch-all
    server {
listen <local_addr-v4>:80 default_server;
listen [<local_addr-v6>]:80 default_server;

            access_log /var/log/nginx/access_log main;
            error_log /var/log/nginx/error_log info;
            rewrite_log off;

            return 410;
    }

    ########## include-1-start ###############

server {
#
# HTTP access not wanted, redirect to HTTPS!
#
listen <local_addr-v4>:80;
listen [<local_addr-v6>]:80;
include listen;
server_name armbox.tld;

      access_log /var/log/nginx/sheeva.log main;
      error_log /var/log/nginx/sheeva.elog info;
      rewrite_log off;

      ########## include-2-start ###############
            allow <subnet-v6>; # replaced subnet with placeholder
allow <subnet-v6>;
allow <subnet-v4>;
allow <subnet-v4>;
deny all;

root /home/www/htdocs;
index index.php index.html;

error_page 400 /error400.php;
error_page 403 /error403.php;
error_page 404 /error404.php;
error_page 500 /error500.php;
error_page 502 /error502.php;
error_page 503 /error503.php;

rewrite ^/$     /status.html redirect;

# Status monitoring pages
location ~ ^/status/php-fpm$ {
        include fastcgi_params;
        fastcgi_buffer_size 8k;
        fastcgi_buffers 16 4k;
        fastcgi_param SCRIPT_FILENAME /dev/null;
        fastcgi_param REDIRECT_STATUS 200;
        fastcgi_pass unix:/run/php-fpm/fpm.socket;
}
location = /status/nginx {
        stub_status on;
}

# Remaining pages
location ~ ^/(?<page>.*)\.html$ {
        # Rewrite non-html pages to php
        if (-f $request_filename) { break; }
        if (-f $document_root/$page.php ) { rewrite ^ /$page.php last; }
}
rewrite ^/rrdgraph.(png|svg|pdf|eps)$ /rrdgraph.php last;
location ~ ^/(?<path>.*/)?error(?<ecode>[0-9]+).php$ {
# Handle error pages
if (!-f $document_root/$path/error.php) { rewrite ^
/error404.txt last; }
if ($ecode !~ [0-9]+) { set $ecode 200; }
include fastcgi_params;
fastcgi_buffer_size 8k;
fastcgi_buffers 16 4k;
fastcgi_param SCRIPT_FILENAME
$document_root/$path/error.php;
fastcgi_param REDIRECT_STATUS $ecode;
fastcgi_pass unix:/var/run/php-fpm/fpm.socket;
}
location ~ .php$ {
# Handle PHP pages
if (!-f $request_filename) { rewrite ^ /error404.php last; }
include fastcgi_params;
fastcgi_buffer_size 8k;
fastcgi_buffers 16 4k;
fastcgi_param SCRIPT_FILENAME $request_filename;
fastcgi_param REDIRECT_STATUS 200;
fastcgi_pass unix:/var/run/php-fpm/fpm.socket;
}
location /img/ {
expires 1h;
}
location /css/ {
expires 1h;
}
location /js/ {
expires 1h;
}
########## include-2-end ###############
}
########## include-1-end ###############
}

Hello!

On Fri, Jun 08, 2012 at 02:40:52PM +0200, Bruno Prémont wrote:

Running nginx on ARM I’m having it segfault at about any request (those
known not to crash are /status/nginx and /status/php-fpm).
Attaching it with GDB I get the following trace:

[…]

    geoip_country  /usr/share/GeoIP/GeoIPv6.dat;

Does it work for you if you don’t use GeoIP?

[…]

Maxim D.

Hello Maxim,

Does it work for you if you don’t use GeoIP?

Just disabling it config side makes no difference.

I will try disabling it at configure time and see if it changes
anything, though I doubt it will.

Exact same result when geoip support is not built at all.

Looking more exactly at the URLs I tested, static file like images
don’t crash the worker, just those that get handled by php-fpm upstream
do (e.g. /collectd/ which implies /collectd/index.php).

For the static files the result browser side looks the same, connection
closed before getting any content but that time around nginx logs
something
to error log:
2012/06/08 23:30:42 [alert] 20638#0: *2 pread() read only 400 of 32768
from “/home/www/htdocs/collectd/add.png” while sending response to
client, client: 123.123.123.123, server: arm.tld, request: “GET
/collectd/add.png HTTP/1.1”, host: “arm.tld”

Though stat on that file returns:
File: `/home/www/htdocs/collectd/add.png'
Size: 400 Blocks: 8 IO Block: 4096 regular file
Device: b302h/45826d Inode: 33 Links: 1
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2010-02-26 17:31:32.899120476 +0100
Modify: 2010-02-26 17:31:32.899120476 +0100
Change: 2011-05-07 01:19:21.440005750 +0200
Birth: -

Toggling sendfile back to “on” seems to get file access to work again
(a few days ago with 1.2.0 turning it off made it work for a day or
so)???

Bruno

Hello Maxim,

On Fri, 08 June 2012 Maxim D. [email protected] wrote:

On Fri, Jun 08, 2012 at 02:40:52PM +0200, Bruno Prémont wrote:

Running nginx on ARM I’m having it segfault at about any request (those
known not to crash are /status/nginx and /status/php-fpm).
Attaching it with GDB I get the following trace:

[…]

    geoip_country  /usr/share/GeoIP/GeoIPv6.dat;

Does it work for you if you don’t use GeoIP?

Just disabling it config side makes no difference.

I will try disabling it at configure time and see if it changes
anything, though I doubt it will.

Oh, and in case it is important, my fastcgi_params include
some extra options:

fastcgi_param QUERY_STRING $query_string;
fastcgi_param REQUEST_METHOD $request_method;
fastcgi_param CONTENT_TYPE $content_type;
fastcgi_param CONTENT_LENGTH $content_length;

fastcgi_param SCRIPT_NAME $fastcgi_script_name;
fastcgi_param REQUEST_URI $request_uri;
fastcgi_param DOCUMENT_URI $document_uri;
fastcgi_param DOCUMENT_ROOT $document_root;
fastcgi_param SERVER_PROTOCOL $server_protocol;
fastcgi_param HTTPS $https if_not_empty;
fastcgi_param SSL_CIPHER $ssl_cipher if_not_empty;
fastcgi_param SSL_PROTOCOL $ssl_protocol if_not_empty;
fastcgi_param SSL_SESSION_ID $ssl_session_id if_not_empty;
fastcgi_param SSL_CLIENT_SERIAL $ssl_client_serial if_not_empty;
fastcgi_param SSL_CLIENT_S_DN $ssl_client_s_dn if_not_empty;
fastcgi_param SSL_CLIENT_I_DN $ssl_client_i_dn if_not_empty;
fastcgi_param SSL_CLIENT_CERT $ssl_client_cert if_not_empty;
fastcgi_param SSL_CLIENT_RAW_CERT $ssl_client_raw_cert if_not_empty;
fastcgi_param SSL_CLIENT_VERIFY $ssl_client_verify if_not_empty;

# disabled as requested

#fastcgi_param GEOIP_COUNTRY_CODE $geoip_country_code;
#fastcgi_param GEOIP_COUNTRY_NAME $geoip_country_name;

fastcgi_param GATEWAY_INTERFACE CGI/1.1;
fastcgi_param SERVER_SOFTWARE nginx/$nginx_version;

fastcgi_param REMOTE_ADDR $remote_addr;
fastcgi_param REMOTE_PORT $remote_port;
fastcgi_param SERVER_ADDR $server_addr;
fastcgi_param SERVER_PORT $server_port;
fastcgi_param SERVER_NAME $server_name;

# PHP only, required if PHP was built with --enable-force-cgi-redirect

fastcgi_param REDIRECT_STATUS 200;

[…]

Maxim D.

Bruno

Hallo Maxim,

On Sat, 09 June 2012 Maxim D. [email protected] wrote:

Looking more exactly at the URLs I tested, static file like images
[…]
immediate exit on SIGBUS when unaligned access happens. You may
get proper behaviour with

echo 4 > /proc/cpu/alignment

This should allow to trace a root of your problems.

See http://lecs.cs.ucla.edu/wiki/index.php/XScale_alignment for
more details.

Thanks for the pointer, will read through it!

Seems to be that one, after echoing 4 to /proc/cpu/alignment nginx
does not even start anymore (and nginx -t fails as well), each time
with
SIGBUS.

e.g. for nginx -t the first SIGBUS happens at

#0 0x0000d64c in ngx_set_cpu_affinity (cf=0xbe892358, cmd=, conf=) at src/core/nginx.c:1275
#1 0x0001cafc in ngx_conf_handler (last=13909340, cf=0xbe892358) at
src/core/ngx_conf_file.c:394
#2 ngx_conf_parse (cf=0xbe892358, filename=0xd43d70) at
src/core/ngx_conf_file.c:244
#3 0x0001aba4 in ngx_init_cycle (old_cycle=0xbe8923c0) at
src/core/ngx_cycle.c:268
#4 0x0000e29c in main (argc=, argv=) at
src/core/nginx.c:331

as backtraced with gdb.

Thanks,
Bruno

Hello!

On Fri, Jun 08, 2012 at 11:40:46PM +0200, Bruno Prémont wrote:

    geoip_country  /usr/share/GeoIP/GeoIPv6.dat;

Looking more exactly at the URLs I tested, static file like images
don’t crash the worker, just those that get handled by php-fpm upstream
do (e.g. /collectd/ which implies /collectd/index.php).

You’ve claimed above that “/status/php-fpm” works ok too. Was that a
mistake?

Anyway, please make sure you have alignment problems properly
reported by a kernel. It looks like the linux kernel has an
unfortunate default to silently ignore alignment problems on arm,
which results in data corruption on unaligned accesses instead of
immediate exit on SIGBUS when unaligned access happens. You may
get proper behaviour with

echo 4 > /proc/cpu/alignment

This should allow you to trace the root of your problems.

See http://lecs.cs.ucla.edu/wiki/index.php/XScale_alignment for
more details.

Maxim D.

Hello!

On Sat, Jun 09, 2012 at 04:54:01PM +0200, Bruno Prémont wrote:

[…]
Exact same result when geoip support is not built at all.
[…]
reported by a kernel. It looks like the linux kernel has an
[…]
more details.
[…]
#1 0x0001cafc in ngx_conf_handler (last=13909340, cf=0xbe892358) at
src/core/ngx_conf_file.c:394
#2 ngx_conf_parse (cf=0xbe892358, filename=0xd43d70) at
src/core/ngx_conf_file.c:244
#3 0x0001aba4 in ngx_init_cycle (old_cycle=0xbe8923c0) at
src/core/ngx_cycle.c:268
#4 0x0000e29c in main (argc=, argv=) at
src/core/nginx.c:331

as backtraced with gdb.

Ok, this looks sensible.

Could you please provide ./configure output and test if the
following patch fixes things for you?

diff --git a/auto/os/conf b/auto/os/conf
--- a/auto/os/conf
+++ b/auto/os/conf
@@ -93,6 +93,7 @@ case "$NGX_MACHINE" in
     ;;

     *)
+        have=NGX_ALIGNMENT value=16 . auto/define
         NGX_MACH_CACHE_LINE=32
     ;;


Maxim D.

Hello Maxim,

On Sun, 10 June 2012 Maxim D. [email protected] wrote:

Ok, this looks sensible.
     *)
+        have=NGX_ALIGNMENT value=16 . auto/define
         NGX_MACH_CACHE_LINE=32
     ;;

The patch seems to fix things, nginx -t does not die on SIGBUS
anymore,
it also runs properly for the requests that made it fail (
/proc/cpu/alignment does not account any new alignment traps).

Thanks!
Bruno

Full configure output (as well as first few lines of make which shows
used CFLAGS – compiler does not generate any warnings):

checking for OS

  + Linux 2.6.37-00003-g924cf4c armv5tel
    checking for C compiler … found
  + using GNU C compiler
    checking for --with-ld-opt=“-L/usr/lib” … found
    checking for gcc builtin atomic operations … found
    checking for C99 variadic macros … found
    checking for gcc variadic macros … found
    checking for unistd.h … found
    checking for inttypes.h … found
    checking for limits.h … found
    checking for sys/filio.h … not found
    checking for sys/param.h … found
    checking for sys/mount.h … found
    checking for sys/statvfs.h … found
    checking for crypt.h … found
    checking for Linux specific features
    checking for epoll … found
    checking for sendfile() … found
    checking for sendfile64() … found
    checking for sys/prctl.h … found
    checking for prctl(PR_SET_DUMPABLE) … found
    checking for sched_setaffinity() … found
    checking for crypt_r() … found
    checking for sys/vfs.h … found
    checking for poll() … found
    checking for /dev/poll … not found
    checking for kqueue … not found
    checking for crypt() … not found
    checking for crypt() in libcrypt … found
    checking for F_READAHEAD … not found
    checking for posix_fadvise() … found
    checking for O_DIRECT … found
    checking for F_NOCACHE … not found
    checking for directio() … not found
    checking for statfs() … found
    checking for statvfs() … found
    checking for dlopen() … not found
    checking for dlopen() in libdl … found
    checking for sched_yield() … found
    checking for SO_SETFIB … not found
    checking for SO_ACCEPTFILTER … not found
    checking for TCP_DEFER_ACCEPT … found
    checking for TCP_KEEPIDLE, TCP_KEEPINTVL, TCP_KEEPCNT … found
    checking for TCP_INFO … found
    checking for accept4() … found
    checking for kqueue AIO support … not found
    checking for Linux AIO support … found
    checking for int size … 4 bytes
    checking for long size … 4 bytes
    checking for long long size … 8 bytes
    checking for void * size … 4 bytes
    checking for uint64_t … found
    checking for sig_atomic_t … found
    checking for sig_atomic_t size … 4 bytes
    checking for socklen_t … found
    checking for in_addr_t … found
    checking for in_port_t … found
    checking for rlim_t … found
    checking for uintptr_t … uintptr_t found
    checking for system endianess … little endianess
    checking for size_t size … 4 bytes
    checking for off_t size … 8 bytes
    checking for time_t size … 4 bytes
    checking for AF_INET6 … found
    checking for setproctitle() … not found
    checking for pread() … found
    checking for pwrite() … found
    checking for sys_nerr … found
    checking for localtime_r() … found
    checking for posix_memalign() … found
    checking for memalign() … found
    checking for mmap(MAP_ANON|MAP_SHARED) … found
    checking for mmap(“/dev/zero”, MAP_SHARED) … found
    checking for System V shared memory … found
    checking for POSIX semaphores … not found
    checking for POSIX semaphores in libpthread … found
    checking for struct msghdr.msg_control … found
    checking for ioctl(FIONBIO) … found
    checking for struct tm.tm_gmtoff … found
    checking for struct dirent.d_namlen … not found
    checking for struct dirent.d_type … found
    checking for sysconf(_SC_NPROCESSORS_ONLN) … found
    checking for openat(), fstatat() … found
    configuring additional modules
    adding module in
    /var/tmp/portage/www-servers/nginx-1.2.1/work/agentzh-headers-more-nginx-module-3580526
  + ngx_http_headers_more_filter_module was configured
    checking for PCRE library … found
    checking for PCRE JIT support … found
    checking for OpenSSL library … found
    checking for zlib library … found
    checking for libxslt … found
    checking for libexslt … found
    creating objs/Makefile

Configuration summary

  + using system PCRE library
  + using system OpenSSL library
  + md5: using OpenSSL library
  + sha1: using OpenSSL library
  + using system zlib library

nginx path prefix: “/usr”
nginx binary file: “/usr/sbin/nginx”
nginx configuration prefix: “/etc/nginx”
nginx configuration file: “/etc/nginx/nginx.conf”
nginx pid file: “/var/run/nginx.pid”
nginx error log file: “/var/log/nginx/error_log”
nginx http access log file: “/var/log/nginx/access_log”
nginx http client request body temporary files:
“/var/tmp/nginx/client”
nginx http fastcgi temporary files: “/var/tmp/nginx/fastcgi”

make -j2 'LINK=armv5tel-softfloat-linux-gnueabi-gcc -Wl,-O1
-Wl,--as-needed' 'OTHERLDFLAGS=-Wl,-O1 -Wl,--as-needed'
make -f objs/Makefile
make[1]: Entering directory
`/var/tmp/portage/www-servers/nginx-1.2.1/work/nginx-1.2.1’
armv5tel-softfloat-linux-gnueabi-gcc -c -O2 -march=armv5te -mtune=xscale
-pipe -Wall -ggdb -I/usr/include -I src/core -I src/event -I
src/event/modules -I src/os/unix -I /usr/include/libxml2 -I objs
-o objs/src/core/nginx.o
src/core/nginx.c