You have 300 concurrent connections - try tuning the pool size and the max DB connections to be larger than that.
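For example, a chef-server.rb sketch along those lines (the numbers below are illustrative assumptions, not tested recommendations):

# Size the erchef pool for roughly 300 concurrent clients, and keep
# PostgreSQL's connection limit above the pool size with some headroom.
erchef['db_pool_size'] = 350
postgresql['max_connections'] = 400

After editing /etc/chef-server/chef-server.rb, running "chef-server-ctl reconfigure" applies the new settings.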
Hi Chefs,
I'm using Chef to bootstrap a 300-node cluster, and a few chef-clients throw "503 Service Unavailable" when all 300 nodes start syncing cookbooks at the same time. The Chef server log shows that erchef crashed. Most of the time, syncing cookbooks on all the nodes again after the first failure succeeds.
There is a ticket, CHEF-4423 ("Chef Synchronization of Cookbook Slow"), aimed at speeding up cookbook syncing. I applied the new chef-client that contains the fix; however, it doesn't help in my case.
Are there any tips for tuning Open Source Chef Server 11 to serve hundreds of nodes? Thanks a lot.
Here is my Chef configuration:
## Chef Server ##
2 cores + 8G Mem
$ cat /etc/issue
CentOS release 5.10 (Final)
$ rpm -q chef-server
chef-server-11.1.1-1.el5
$ cat /etc/chef-server/chef-server.rb
chef_server_webui['enable'] = false
nginx['ssl_port'] = 9443
nginx['non_ssl_port'] = 9080
chef_solr['heap_size'] = 3072
chef_solr['commit_interval'] = 3000
chef_solr['poll_seconds'] = 6
chef_expander['nodes'] = 4
erchef['s3_parallel_ops_timeout'] = 30000
erchef['s3_url_ttl'] = 3600
erchef['db_pool_size'] = 150
postgresql['max_connections'] = 200
rabbitmq['node_port'] = 5672
## Chef Client ##
$ cat /etc/issue
CentOS release 6.4 (Final)
$ rpm -q chef
chef-11.14.0.rc.2-1.el6.x86_64
$ cat /etc/chef/client.rb
log_location STDOUT
chef_server_url "https://<fqdn>:9443"
validation_client_name "chef-validator"
node_name "bigtop-master-0"
no_lazy_load true
ssl_verify_mode :verify_peer
ssl_ca_path "/etc/chef/trusted_certs"
## chef-server-ctl tail ##
The full output of "chef-server-ctl tail" is attached.
==> /var/log/chef-server/erchef/crash.log.1 <==
** Last event was: {log,{lager_msg,[],[{application,chef_wm},{module,chef_wm_status},{function,log_failure},{line,77},{pid,"<0.23096.41>"},{node,' '}],error,{["2014",45,"07",45,"20"],["22",58,"40",58,"02",46,"080"]},{1405,896002,80894},[47,95,115,116,97,116,117,115,"\n","{{status,fail},{upstreams,{[{<<\"chef_solr\">>,<<\"fail\">>},{<<\"chef_sql\">>,<<\"pong\">>}]}}}"]}}
** When handler state == {state,{mask,127},lager_default_formatter,["[",severity,"] ",message,"\n"],[]}
** Reason == {badarg,[{io,put_chars,[user,unicode,["[","error","] ",[47,95,115,116,97,116,117,115,"\n","{{status,fail},{upstreams,{[{<<\"chef_solr\">>,<<\"fail\">>},{<<\"chef_sql\">>,<<\"pong\">>}]}}}"],"\n"]],[]},{lager_console_backend,handle_event,2,[{file,"src/lager_console_backend.erl"},{line,95}]},{gen_event,server_update,4,[{file,"gen_event.erl"},{line,504}]},{gen_event,server_notify,4,[{file,"gen_event.erl"},{line,486}]},{gen_event,handle_msg,5,[{file,"gen_event.erl"},{line,248}]},{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}
2014-07-20 22:40:02 =ERROR REPORT====
{<<"method=GET; path=/_status; status=500; ">>,"Internal Server Error"}2014-07-20 22:40:02 =ERROR REPORT====
** gen_event handler lager_console_backend crashed.
** Was installed in lager_event
** Last event was: {log,{lager_msg,[],[{pid,<0.101.0>}],error,{["2014",45,"07",45,"20"],["22",58,"40",58,"02",46,"085"]},{1405,896002,85731},[103,101,110,95,101,118,101,110,116,32,"lager_console_backend",32,105,110,115,116,97,108,108,101,100,32,105,110,32,"lager_event",32,116,101,114,109,105,110,97,116,101,100,32,119,105,116,104,32,114,101,97,115,111,110,58,32,"bad argument in call to io:put_chars(user, unicode, [\"[\",\"error\",\"] \",[47,95,115,116,97,116,117,115,\"\\n\",\"{{status,fail},{upstreams,{[{<<\\\"chef_solr\\\">...\"],...]) in lager_console_backend:handle_event/2 line 95"]}}
** When handler state == {state,{mask,127},lager_default_formatter,["[",severity,"] ",message,"\n"],[]}
** Reason == {badarg,[{io,put_chars,[user,unicode,["[","error","] ",[103,101,110,95,101,118,101,110,116,32,"lager_console_backend",32,105,110,115,116,97,108,108,101,100,32,105,110,32,"lager_event",32,116,101,114,109,105,110,97,116,101,100,32,119,105,116,104,32,114,101,97,115,111,110,58,32,"bad argument in call to io:put_chars(user, unicode, [\"[\",\"error\",\"] \",[47,95,115,116,97,116,117,115,\"\\n\",\"{{status,fail},{upstreams,{[{<<\\\"chef_solr\\\">...\"],...]) in lager_console_backend:handle_event/2 line 95"],"\n"]],[]},{lager_console_backend,handle_event,2,[{file,"src/lager_console_backend.erl"},{line,95}]},{gen_event,server_update,4,[{file,"gen_event.erl"},{line,504}]},{gen_event,server_notify,4,[{file,"gen_event.erl"},{line,486}]},{gen_event,handle_msg,5,[{file,"gen_event.erl"},{line,248}]},{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,227}]}]}
==> /var/log/chef-server/erchef/erchef.log <==
2014-07-22 04:15:48.514 [error] Supervisor stats_hero_worker_sup had child stats_hero started with {stats_hero,start_link,undefined} at <0.30506.3> exit with reason {timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"358","\n",[[["chefAPI",".application.byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".","10",".byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".application.allRequests"],58,"1866",124,"h","\n"],[["chefAPI",".","10",".allRequests"],58,"1866",124,"h","\n"],[["chefAPI",".application.byRequestType.","roles",".","GET"],58,"1866",124,"h","\n"],[["chefAPI",".upstreamRequests.","rdbms"],58,"1834",124,"h","\n"],[["chefAPI",".upstreamReques...",...],...],...]]}]}} in context child_terminated
2014-07-22 04:15:48.600 [error] Supervisor stats_hero_worker_sup had child stats_hero started with {stats_hero,start_link,undefined} at <0.30501.3> exit with reason {timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"356","\n",[[["chefAPI",".application.byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".","10",".byStatusCode.","200"],58,"1",124,"m","\n"],[["chefAPI",".application.allRequests"],58,"2000",124,"h","\n"],[["chefAPI",".","10",".allRequests"],58,"2000",124,"h","\n"],[["chefAPI",".application.byRequestType.","data",".","GET"],58,"2000",124,"h","\n"],[["chefAPI",".upstreamRequests.","rdbms"],58,"1987",124,"h","\n"],[["chefAPI",".upstreamRequest...",...],...],...]]}]}} in context child_terminated
2014-07-22 04:15:48.613 [error] Supervisor stats_hero_worker_sup had child stats_hero started with {stats_hero,start_link,undefined} at <0.30527.3> exit with reason {timeout,{gen_server,call,[<0.925.0>,{send,[49,124,"110","\n",[[["chefAPI",".application.allRequests"],58,"1",124,"m","\n"],[["chefAPI",".","10",".allRequests"],58,"1",124,"m","\n"],[["chefAPI",".application.byRequestType.","data",".","GET"],58,"1",124,"m","\n"]]]}]}} in context child_terminated
2014-07-22 04:16:12.850 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:16:13.010 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:16:13.024 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:17:05.852 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:17:13.356 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:17:13.363 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
2014-07-22 04:17:13.433 [error] {<<"method=POST; path=/environments/_default/cookbook_versions; status=503; ">>,"Service Unavailable"}
Thanks
Jesse Hu