Hi Chefs,
I'm using Chef to bootstrap a 300-node cluster, and a few
chef-clients throw the error "503 Service Unavailable" when all
300 nodes start syncing cookbooks at the same time. The chef
server log shows erchef crashed. Most of the time, syncing
cookbooks on all the nodes again after the first failure
succeeds.
There is a ticket, CHEF-4423 ("Synchronization of Cookbooks
Slow"), intended to speed up syncing cookbooks. I applied the
new chef-client which contains the fix.
However, the fix doesn't work in my case.
Are there any tips for tuning Open Source Chef Server 11 to serve
hundreds of nodes? Thanks a lot.
Here is my chef configuration:
## Chef Server ##
2 cores + 8G Mem
$ cat /etc/issue
CentOS release 5.10 (Final)
$ rpm -q chef-server
chef-server-11.1.1-1.el5
$ cat /etc/chef-server/chef-server.rb
chef_server_webui['enable'] = false
nginx['ssl_port'] = 9443
nginx['non_ssl_port'] = 9080
chef_solr['heap_size'] = 3072
chef_solr['commit_interval'] = 3000
chef_solr['poll_seconds'] = 6
chef_expander['nodes'] = 4
erchef['s3_parallel_ops_timeout'] = 30000
erchef['s3_url_ttl'] = 3600
erchef['db_pool_size'] = 150
postgresql['max_connections'] = 200
rabbitmq['node_port'] = 5672
## Chef Client ##
$ cat /etc/issue
CentOS release 6.4 (Final)
$ rpm -q chef
chef-11.14.0.rc.2-1.el6.x86_64
$ cat /etc/chef/client.rb
log_location STDOUT
chef_server_url "https://<fqdn>:9443"
validation_client_name "chef-validator"
node_name "bigtop-master-0"
no_lazy_load true
ssl_verify_mode :verify_peer
ssl_ca_path "/etc/chef/trusted_certs"
## chef-server-ctl tail ##
The full output of "chef-server-ctl tail" is attached.
==> /var/log/chef-server/erchef/crash.log.1 <==
** Last event was:
{log,{lager_msg,[],[{application,chef_wm},{module,chef_wm_status},{function,log_failure},{line,77},{pid,"<0.23096.41>
'%7D],error,%7B[" target="_blank">"},{node,'
'}],error,{["2014",45,"07",45,"20"],["22",58,"40",58,"02",46,"080"]},{1405,896002,80894},[47,95,115,116,97,116,117,115,"\n","{{status,fail},{upstreams,{[{<<\"chef_solr\">>,<<\"fail\">>},{<<\"chef_sql\">>,<<\"pong\">>}]}}}"]}}
** When handler state ==
{state,{mask,127},lager_default_formatter,["[",severity,"]
",message,"\n"],[]}
** Reason == {badarg,[{io,put_chars,[user,unicode,["[","error","]
",[47,95,115,116,97,116,117,115,"\n","{{status,fail},{upstreams,{[{<<\"chef_solr\">>,<<\"fail\">>},{<<\"chef_sql\">>,<<\"pong\">>}]}}}"],"\n"]],[]},{lager_console_backend,handle_event,2,[{file,"src/lager_console_backend.erl"},{line,95}]},{gen_event,server_update,4,[{file,"gen_event.erl"},{line,504}]},{gen_event,server_notify,4,[{file,"gen_event.erl"},{line,486}]},{gen_event,handle_msg,5,[{file,"gen_event.erl"},{line,248}]},{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}
2014-07-20 22:40:02 =ERROR REPORT====
{<<"method=GET; path=/_status; status=500;
">>,"Internal Server Error"}2014-07-20 22:40:02 =ERROR
REPORT====
** gen_event handler lager_console_backend crashed.
** Was installed in lager_event
** Last event was:
{log,{lager_msg,[],[{pid,<0.101.0>}],error,{["2014",45,"07",45,"20"],["22",58,"40",58,"02",46,"085"]},{1405,896002,85731},[103,101,110,95,101,118,101,110,116,32,"lager_console_backend",32,105,110,115,116,97,108,108,101,100,32,105,110,32,"lager_event",32,116,101,114,109,105,110,97,116,101,100,32,119,105,116,104,32,114,101,97,115,111,110,58,32,"bad
argument in call to io:put_chars(user, unicode,
[\"[\",\"error\",\"]
\",[47,95,115,116,97,116,117,115,\"\\n\",\"{{status,fail},{upstreams,{[{<<\\\"chef_solr\\\">...\"],...])
in lager_console_backend:handle_event/2 line 95"]}}
** When handler state ==
{state,{mask,127},lager_default_formatter,["[",severity,"]
",message,"\n"],[]}
** Reason == {badarg,[{io,put_chars,[user,unicode,["[","error","]
",[103,101,110,95,101,118,101,110,116,32,"lager_console_backend",32,105,110,115,116,97,108,108,101,100,32,105,110,32,"lager_event",32,116,101,114,109,105,110,97,116,101,100,32,119,105,116,104,32,114,101,97,115,111,110,58,32,"bad
argument in call to io:put_chars(user, unicode,
[\"[\",\"error\",\"]
\",[47,95,115,116,97,116,117,115,\"\\n\",\"{{status,fail},{upstreams,{[{<<\\\"chef_solr\\\">...\"],...])
in lager_console_backend:handle_event/2 line
95"],"\n"]],[]},{lager_console_backend,handle_event,2,[{file,"src/lager_console_backend.erl"},{line,95}]},{gen_event,server_update,4,[{file,"gen_event.erl"},{line,504}]},{gen_event,server_notify,4,[{file,"gen_event.erl"},{line,486}]},{gen_event,handle_msg,5,[{file,"gen_event.erl"},{line,248}]},{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}
==> /var/log/chef-server/erchef/erchef.log <==
2014-07-22 04:15:48.514 [error] Supervisor stats_hero_worker_sup
had child stats_hero started with
{stats_hero,start_link,undefined} at <0.30506.3> exit with
reason
{timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"358","\n",[[["chefAPI",".application.byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".","10",".byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".application.allRequests"],58,"1866",124,"h","\n"],[["chefAPI",".","10",".allRequests"],58,"1866",124,"h","\n"],[["chefAPI",".application.byRequestType.","roles",".","GET"],58,"1866",124,"h","\n"],[["chefAPI",".upstreamRequests.","rdbms"],58,"1834",124,"h","\n"],[["chefAPI",".upstreamReques...",...],...],...]]}]}}
in context child_terminated
2014-07-22 04:15:48.600 [error] Supervisor stats_hero_worker_sup
had child stats_hero started with
{stats_hero,start_link,undefined} at <0.30501.3> exit with
reason
{timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"356","\n",[[["chefAPI",".application.byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".","10",".byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".application.allRequests"],58,"2000",124,"h","\n"],[["chefAPI",".","10",".allRequests"],58,"2000",124,"h","\n"],[["chefAPI",".application.byRequestType.","data",".","GET"],58,"2000",124,"h","\n"],[["chefAPI",".upstreamRequests.","rdbms"],58,"1987",124,"h","\n"],[["chefAPI",".upstreamRequest...",...],...],...]]}]}}
in context child_terminated
2014-07-22 04:15:48.613 [error] Supervisor stats_hero_worker_sup
had child stats_hero started with
{stats_hero,start_link,undefined} at <0.30527.3> exit with
reason
{timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"110","\n",[[["chefAPI",".application.allRequests"],58,"1",124,"m","\n"],[["chefAPI",".","10",".allRequests"],58,"1",124,"m","\n"],[["chefAPI",".application.byRequestType.","data",".","GET"],58,"1",124,"m","\n"]]]}]}}
in context child_terminated
2014-07-22 04:16:12.850 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:16:13.010 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:16:13.024 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:17:05.852 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:17:13.356 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:17:13.363 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
2014-07-22 04:17:13.433 [error] {<<"method=POST;
path=/environments/_default/cookbook_versions; status=503;
">>,"Service Unavailable"}
Thanks
Jesse Hu