Fix shutdown hangs.
We would occasionally hang or a while during server shudown, this is due to an outbound peer waiting on a connection or a sleep. However, we don't actually require to wait for the peers to finish at all. So just let them finish. Secondly, make peer.disconnnect and server.shutdown atomic varaibles so that checking them from multiple goroutines isn't race, and clean up their usage.
This commit is contained in:
parent
7eb1a5ce4e
commit
98109dbadd
2 changed files with 36 additions and 40 deletions
40
peer.go
40
peer.go
|
@ -18,6 +18,7 @@ import (
|
|||
"net"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
|
@ -101,7 +102,7 @@ type peer struct {
|
|||
na *btcwire.NetAddress
|
||||
timeConnected time.Time
|
||||
inbound bool
|
||||
disconnect bool
|
||||
disconnect int32 // only to be used atomically
|
||||
persistent bool
|
||||
versionKnown bool
|
||||
knownAddresses map[string]bool
|
||||
|
@ -116,7 +117,6 @@ type peer struct {
|
|||
requestQueue *list.List
|
||||
invSendQueue *list.List
|
||||
continueHash *btcwire.ShaHash
|
||||
wg sync.WaitGroup
|
||||
outputQueue chan btcwire.Message
|
||||
outputInvChan chan *btcwire.InvVect
|
||||
blockProcessed chan bool
|
||||
|
@ -692,7 +692,7 @@ func (p *peer) handleAddrMsg(msg *btcwire.MsgAddr) {
|
|||
|
||||
for _, na := range msg.AddrList {
|
||||
// Don't add more address if we're disconnecting.
|
||||
if p.disconnect {
|
||||
if atomic.LoadInt32(&p.disconnect) != 0 {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -752,7 +752,7 @@ func (p *peer) readMessage() (msg btcwire.Message, buf []byte, err error) {
|
|||
// writeMessage sends a bitcoin Message to the peer with logging.
|
||||
func (p *peer) writeMessage(msg btcwire.Message) {
|
||||
// Don't do anything if we're disconnecting.
|
||||
if p.disconnect == true {
|
||||
if atomic.LoadInt32(&p.disconnect) != 0 {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -812,7 +812,7 @@ func (p *peer) isAllowedByRegression(err error) bool {
|
|||
// goroutine.
|
||||
func (p *peer) inHandler() {
|
||||
out:
|
||||
for !p.disconnect {
|
||||
for atomic.LoadInt32(&p.disconnect) == 0 {
|
||||
rmsg, buf, err := p.readMessage()
|
||||
if err != nil {
|
||||
// In order to allow regression tests with malformed
|
||||
|
@ -826,7 +826,7 @@ out:
|
|||
}
|
||||
|
||||
// Only log the error if we're not forcibly disconnecting.
|
||||
if !p.disconnect {
|
||||
if atomic.LoadInt32(&p.disconnect) == 0 {
|
||||
log.Errorf("[PEER] Can't read message: %v", err)
|
||||
}
|
||||
break out
|
||||
|
@ -890,7 +890,7 @@ out:
|
|||
|
||||
// Mark the address as currently connected and working as of
|
||||
// now if one of the messages that trigger
|
||||
if markConnected && !p.disconnect {
|
||||
if markConnected && atomic.LoadInt32(&p.disconnect) == 0 {
|
||||
if p.na == nil {
|
||||
log.Warnf("we're getting stuff before we " +
|
||||
"got a version message. that's bad")
|
||||
|
@ -905,9 +905,7 @@ out:
|
|||
p.Disconnect()
|
||||
p.server.donePeers <- p
|
||||
p.server.blockManager.DonePeer(p)
|
||||
p.quit <- true
|
||||
|
||||
p.wg.Done()
|
||||
log.Tracef("[PEER] Peer input handler done for %s", p.conn.RemoteAddr())
|
||||
}
|
||||
|
||||
|
@ -928,7 +926,8 @@ out:
|
|||
case <-trickleTicker.C:
|
||||
// Don't send anything if we're disconnecting or there
|
||||
// is no queued inventory.
|
||||
if p.disconnect || p.invSendQueue.Len() == 0 {
|
||||
if atomic.LoadInt32(&p.disconnect) != 0 ||
|
||||
p.invSendQueue.Len() == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -962,7 +961,6 @@ out:
|
|||
break out
|
||||
}
|
||||
}
|
||||
p.wg.Done()
|
||||
log.Tracef("[PEER] Peer output handler done for %s", p.conn.RemoteAddr())
|
||||
}
|
||||
|
||||
|
@ -1011,7 +1009,6 @@ func (p *peer) Start() error {
|
|||
// Start processing input and output.
|
||||
go p.inHandler()
|
||||
go p.outHandler()
|
||||
p.wg.Add(2)
|
||||
p.started = true
|
||||
|
||||
return nil
|
||||
|
@ -1020,7 +1017,11 @@ func (p *peer) Start() error {
|
|||
// Disconnect disconnects the peer by closing the connection. It also sets
|
||||
// a flag so the impending shutdown can be detected.
|
||||
func (p *peer) Disconnect() {
|
||||
p.disconnect = true
|
||||
// did we win the race?
|
||||
if atomic.AddInt32(&p.disconnect, 1) != 1 {
|
||||
return
|
||||
}
|
||||
close(p.quit)
|
||||
if p.conn != nil {
|
||||
p.conn.Close()
|
||||
}
|
||||
|
@ -1031,7 +1032,6 @@ func (p *peer) Disconnect() {
|
|||
func (p *peer) Shutdown() {
|
||||
log.Tracef("[PEER] Shutdown peer %s", p.addr)
|
||||
p.Disconnect()
|
||||
p.wg.Wait()
|
||||
}
|
||||
|
||||
// newPeerBase returns a new base bitcoin peer for the provided server and
|
||||
|
@ -1098,7 +1098,6 @@ func newOutboundPeer(s *server, addr string, persistent bool) *peer {
|
|||
}
|
||||
p.na = btcwire.NewNetAddressIPPort(net.ParseIP(ip), uint16(port), 0)
|
||||
|
||||
p.wg.Add(1)
|
||||
go func() {
|
||||
// Select which dial method to call depending on whether or
|
||||
// not a proxy is configured. Also, add proxy information to
|
||||
|
@ -1118,7 +1117,7 @@ func newOutboundPeer(s *server, addr string, persistent bool) *peer {
|
|||
// Attempt to connect to the peer. If the connection fails and
|
||||
// this is a persistent connection, retry after the retry
|
||||
// interval.
|
||||
for !s.shutdown {
|
||||
for atomic.LoadInt32(&p.disconnect) == 0 {
|
||||
log.Debugf("[SRVR] Attempting to connect to %s", faddr)
|
||||
conn, err := dial("tcp", addr)
|
||||
if err != nil {
|
||||
|
@ -1127,7 +1126,6 @@ func newOutboundPeer(s *server, addr string, persistent bool) *peer {
|
|||
faddr, err)
|
||||
if !persistent {
|
||||
p.server.donePeers <- p
|
||||
p.wg.Done()
|
||||
return
|
||||
}
|
||||
scaledInterval := connectionRetryInterval.Nanoseconds() * p.retrycount / 2
|
||||
|
@ -1141,7 +1139,7 @@ func newOutboundPeer(s *server, addr string, persistent bool) *peer {
|
|||
// While we were sleeping trying to connect, the server
|
||||
// may have scheduled a shutdown. In that case ditch
|
||||
// the peer immediately.
|
||||
if !s.shutdown {
|
||||
if atomic.LoadInt32(&p.disconnect) == 0 {
|
||||
p.server.addrManager.Attempt(p.na)
|
||||
|
||||
// Connection was successful so log it and start peer.
|
||||
|
@ -1149,14 +1147,8 @@ func newOutboundPeer(s *server, addr string, persistent bool) *peer {
|
|||
p.conn = conn
|
||||
p.retrycount = 0
|
||||
p.Start()
|
||||
} else {
|
||||
p.server.donePeers <- p
|
||||
}
|
||||
|
||||
// We are done here, Start() will have grabbed
|
||||
// additional waitgroup entries if we are not shutting
|
||||
// down.
|
||||
p.wg.Done()
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
|
36
server.go
36
server.go
|
@ -12,6 +12,7 @@ import (
|
|||
"net"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
|
@ -48,8 +49,8 @@ type server struct {
|
|||
listeners []net.Listener
|
||||
btcnet btcwire.BitcoinNet
|
||||
started bool
|
||||
shutdown bool
|
||||
shutdownSched bool
|
||||
shutdown int32 // atomic
|
||||
shutdownSched int32 // atomic
|
||||
addrManager *AddrManager
|
||||
rpcServer *rpcServer
|
||||
blockManager *blockManager
|
||||
|
@ -73,7 +74,7 @@ func (s *server) handleAddPeerMsg(peers *list.List, banned map[string]time.Time,
|
|||
|
||||
// Ignore new peers if we're shutting down.
|
||||
direction := directionString(p.inbound)
|
||||
if s.shutdown {
|
||||
if atomic.LoadInt32(&s.shutdown) != 0 {
|
||||
log.Infof("[SRVR] New peer %s (%s) ignored - server is "+
|
||||
"shutting down", p.addr, direction)
|
||||
p.Shutdown()
|
||||
|
@ -130,7 +131,8 @@ func (s *server) handleDonePeerMsg(peers *list.List, p *peer) bool {
|
|||
|
||||
// Issue an asynchronous reconnect if the peer was a
|
||||
// persistent outbound connection.
|
||||
if !p.inbound && p.persistent && !s.shutdown {
|
||||
if !p.inbound && p.persistent &&
|
||||
atomic.LoadInt32(&s.shutdown) == 0 {
|
||||
// attempt reconnect.
|
||||
addr := p.addr
|
||||
e.Value = newOutboundPeer(s, addr, true)
|
||||
|
@ -207,11 +209,11 @@ func (s *server) handleBroadcastMsg(peers *list.List, bmsg *broadcastMsg) {
|
|||
// server. It must be run as a goroutine.
|
||||
func (s *server) listenHandler(listener net.Listener) {
|
||||
log.Infof("[SRVR] Server listening on %s", listener.Addr())
|
||||
for !s.shutdown {
|
||||
for atomic.LoadInt32(&s.shutdown) == 0 {
|
||||
conn, err := listener.Accept()
|
||||
if err != nil {
|
||||
// Only log the error if we're not forcibly shutting down.
|
||||
if !s.shutdown {
|
||||
if atomic.LoadInt32(&s.shutdown) == 0 {
|
||||
log.Errorf("[SRVR] %v", err)
|
||||
}
|
||||
continue
|
||||
|
@ -295,8 +297,8 @@ func (s *server) peerHandler() {
|
|||
// if nothing else happens, wake us up soon.
|
||||
time.AfterFunc(10*time.Second, func() { s.wakeup <- true })
|
||||
|
||||
// Live while we're not shutting down or there are still connected peers.
|
||||
for !s.shutdown || peers.Len() != 0 {
|
||||
out:
|
||||
for {
|
||||
select {
|
||||
// New peers connected to the server.
|
||||
case p := <-s.newPeers:
|
||||
|
@ -336,6 +338,7 @@ func (s *server) peerHandler() {
|
|||
p := e.Value.(*peer)
|
||||
p.Shutdown()
|
||||
}
|
||||
break out
|
||||
}
|
||||
|
||||
// Timer was just to make sure we woke up again soon. so cancel
|
||||
|
@ -346,7 +349,8 @@ func (s *server) peerHandler() {
|
|||
}
|
||||
|
||||
// Only try connect to more peers if we actually need more
|
||||
if outboundPeers >= maxOutbound || s.shutdown {
|
||||
if outboundPeers >= maxOutbound ||
|
||||
atomic.LoadInt32(&s.shutdown) != 0 {
|
||||
continue
|
||||
}
|
||||
groups := make(map[string]int)
|
||||
|
@ -359,7 +363,8 @@ func (s *server) peerHandler() {
|
|||
|
||||
tries := 0
|
||||
for outboundPeers < maxOutbound &&
|
||||
peers.Len() < cfg.MaxPeers && !s.shutdown {
|
||||
peers.Len() < cfg.MaxPeers &&
|
||||
atomic.LoadInt32(&s.shutdown) == 0 {
|
||||
// We bias like bitcoind does, 10 for no outgoing
|
||||
// up to 90 (8) for the selection of new vs tried
|
||||
//addresses.
|
||||
|
@ -494,16 +499,16 @@ func (s *server) Start() {
|
|||
// Stop gracefully shuts down the server by stopping and disconnecting all
|
||||
// peers and the main listener.
|
||||
func (s *server) Stop() error {
|
||||
if s.shutdown {
|
||||
// Make sure this only happens once.
|
||||
if atomic.AddInt32(&s.shutdown, 1) != 1 {
|
||||
log.Infof("[SRVR] Server is already in the process of shutting down")
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Warnf("[SRVR] Server shutting down")
|
||||
|
||||
// Set the shutdown flag and stop all the listeners. There will not be
|
||||
// any listeners if listening is disabled.
|
||||
s.shutdown = true
|
||||
// Stop all the listeners. There will not be any listeners if
|
||||
// listening is disabled.
|
||||
for _, listener := range s.listeners {
|
||||
err := listener.Close()
|
||||
if err != nil {
|
||||
|
@ -532,7 +537,7 @@ func (s *server) WaitForShutdown() {
|
|||
// on remaining duration.
|
||||
func (s *server) ScheduleShutdown(duration time.Duration) {
|
||||
// Don't schedule shutdown more than once.
|
||||
if s.shutdownSched {
|
||||
if atomic.AddInt32(&s.shutdownSched, 1) != 1 {
|
||||
return
|
||||
}
|
||||
log.Warnf("[SRVR] Server shutdown in %v", duration)
|
||||
|
@ -565,7 +570,6 @@ func (s *server) ScheduleShutdown(duration time.Duration) {
|
|||
}
|
||||
}
|
||||
}()
|
||||
s.shutdownSched = true
|
||||
}
|
||||
|
||||
// newServer returns a new btcd server configured to listen on addr for the
|
||||
|
|
Loading…
Add table
Reference in a new issue