Назад | Перейти на главную страницу

DRBD + corosync + pacemaker - серверы не подключаются после аварийного переключения

У меня есть 2 сервера, настроенных с помощью drbd, corosync и pacemaker. Все работает нормально: если первичный сервер выходит из строя, он становится вторичным, а вторичный становится первичным.

Но когда 1-й сервер снова появляется в сети после аварийного переключения, drbd не подключается заново, и мне приходится выполнять

drbdadm connect all

на сервере, который теперь стал вторичным.

Вот журнал с server1 вместе со всеми введенными мной командами ( https://www.refheap.com/95997 ):

root@server1:~# ifdown eth4


Jan 12 08:11:37 server1 kernel: [237958.935578] ixgbe 0000:04:00.0: removed PHC on eth4
Jan 12 08:11:43 server1 kernel: [237965.430849] cib[2877]: segfault at 0 ip 00007f7d342c8aca sp 00007fffaba39e58 error 4 in libc-2.19.so[7f7d34247000+19f000]
Jan 12 08:11:44 server1 kernel: [237966.427423] crmd[2878]: segfault at 0 ip 00007eff7d291aca sp 00007fffaae51538 error 4 in libc-2.19.so[7eff7d210000+19f000]
Jan 12 08:11:46 server1 kernel: [237967.955014] drbd r0: peer( Secondary -> Unknown ) conn( Connected -> NetworkFailure ) pdsk( UpToDate -> DUnknown )
Jan 12 08:11:46 server1 kernel: [237967.955124] block drbd1: new current UUID 913B80AB1E998111:B917764AA4AA3175:A9C5E158DC3CB036:A9C4E158DC3CB036
Jan 12 08:11:46 server1 kernel: [237967.955133] drbd r0: asender terminated
Jan 12 08:11:46 server1 kernel: [237967.955137] drbd r0: Terminating drbd_a_r0
Jan 12 08:11:46 server1 kernel: [237967.955278] drbd r0: Connection closed
Jan 12 08:11:46 server1 kernel: [237967.955418] drbd r0: conn( NetworkFailure -> Unconnected )
Jan 12 08:11:46 server1 kernel: [237967.955420] drbd r0: receiver terminated
Jan 12 08:11:46 server1 kernel: [237967.955423] drbd r0: Restarting receiver thread
Jan 12 08:11:46 server1 kernel: [237967.955424] drbd r0: receiver (re)started
Jan 12 08:11:46 server1 kernel: [237967.955442] drbd r0: conn( Unconnected -> WFConnection )
Jan 12 08:11:46 server1 kernel: [237967.955757] drbd r0: conn( WFConnection -> Disconnecting )
Jan 12 08:11:46 server1 kernel: [237967.955809] drbd r0: Connection closed
Jan 12 08:11:46 server1 kernel: [237967.955931] drbd r0: conn( Disconnecting -> StandAlone )
Jan 12 08:11:47 server1 kernel: [237968.955403] drbd r0: receiver terminated
Jan 12 08:11:47 server1 kernel: [237968.955405] drbd r0: Terminating drbd_r_r0

root@server1:~# cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown   r-----
    ns:36 nr:32 dw:136 dr:15337 al:3 bm:8 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0


root@server1:~# crm status
Last updated: Mon Jan 12 08:12:13 2015
Last change: Fri Jan  9 15:37:31 2015 via crmd on server1
Stack: classic openais (with plugin)
Current DC: server1 - partition WITHOUT quorum
Version: 1.1.10-42f2063
2 Nodes configured, 2 expected votes
3 Resources configured


Online: [ server1 ]
OFFLINE: [ server2 ]

 Master/Slave Set: masterdrbd [drbd]
     Masters: [ server1 ]
     Stopped: [ server2 ]
 Resource Group: complete_start
     drbd_mount (ocf::heartbeat:Filesystem):    Started server1





Jan 12 08:12:23 server1 kernel: [238005.311159] ixgbe 0000:04:00.0: registered PHC device on eth4
Jan 12 08:12:23 server1 kernel: [238005.413333] IPv6: ADDRCONF(NETDEV_UP): eth4: link is not ready
Jan 12 08:12:23 server1 kernel: [238005.477077] ixgbe 0000:04:00.0 eth4: detected SFP+: 6
Jan 12 08:12:24 server1 kernel: [238005.971742] ixgbe 0000:04:00.0 eth4: NIC Link is Up 10 Gbps, Flow Control: RX/TX
Jan 12 08:12:24 server1 kernel: [238005.971957] IPv6: ADDRCONF(NETDEV_CHANGE): eth4: link becomes ready
Jan 12 08:12:27 server1 kernel: [238009.617680] cib[4299]: segfault at 0 ip 00007f8d74805aca sp 00007fffd2ccd9d8 error 4 in libc-2.19.so[7f8d74784000+19f000]
Jan 12 08:12:28 server1 kernel: [238010.612151] crmd[4309]: segfault at 0 ip 00007fb37c610aca sp 00007fff3c4fd5e8 error 4 in libc-2.19.so[7fb37c58f000+19f000]
Jan 12 08:12:33 server1 kernel: [238014.719195] block drbd1: role( Primary -> Secondary )
Jan 12 08:12:33 server1 kernel: [238014.720047] block drbd1: bitmap WRITE of 2 pages took 0 jiffies
Jan 12 08:12:33 server1 kernel: [238014.720058] block drbd1: 8 KB (2 bits) marked out-of-sync by on disk bit-map.



root@server1:~# ifup eth4
root@server1:~# crm status
Last updated: Mon Jan 12 08:12:45 2015
Last change: Fri Jan  9 13:56:26 2015 via crmd on server1
Stack: classic openais (with plugin)
Current DC: server2 - partition with quorum
Version: 1.1.10-42f2063
2 Nodes configured, 2 expected votes
3 Resources configured


Online: [ server1 server2 ]

 Master/Slave Set: masterdrbd [drbd]
     Masters: [ server2 ]
     Slaves: [ server1 ]
 Resource Group: complete_start
     drbd_mount (ocf::heartbeat:Filesystem):    Started server2


root@server1:~# cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:StandAlone ro:Secondary/Unknown ds:UpToDate/DUnknown   r-----
    ns:36 nr:32 dw:148 dr:15337 al:3 bm:8 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:8



root@server1:~# drbdadm connect all


Jan 12 08:13:14 server1 kernel: [238055.707157] drbd r0: conn( StandAlone -> Unconnected )
Jan 12 08:13:14 server1 kernel: [238055.707190] drbd r0: Starting receiver thread (from drbd_w_r0 [981])
Jan 12 08:13:14 server1 kernel: [238055.707296] drbd r0: receiver (re)started
Jan 12 08:13:14 server1 kernel: [238055.707316] drbd r0: conn( Unconnected -> WFConnection )
Jan 12 08:13:14 server1 kernel: [238056.205606] drbd r0: Handshake successful: Agreed network protocol version 101
Jan 12 08:13:14 server1 kernel: [238056.205613] drbd r0: Agreed to support TRIM on protocol level
Jan 12 08:13:14 server1 kernel: [238056.205656] drbd r0: conn( WFConnection -> WFReportParams )
Jan 12 08:13:14 server1 kernel: [238056.205672] drbd r0: Starting asender thread (from drbd_r_r0 [4701])
Jan 12 08:13:14 server1 kernel: [238056.260475] block drbd1: drbd_sync_handshake:
Jan 12 08:13:14 server1 kernel: [238056.260483] block drbd1: self 913B80AB1E998110:B917764AA4AA3175:A9C5E158DC3CB036:A9C4E158DC3CB036 bits:2 flags:0
Jan 12 08:13:14 server1 kernel: [238056.260487] block drbd1: peer F8EB7C91BB3D8B5D:B917764AA4AA3174:A9C5E158DC3CB036:A9C4E158DC3CB036 bits:3 flags:0
Jan 12 08:13:14 server1 kernel: [238056.260490] block drbd1: uuid_compare()=100 by rule 90
Jan 12 08:13:14 server1 kernel: [238056.260497] block drbd1: helper command: /sbin/drbdadm initial-split-brain minor-1
Jan 12 08:13:14 server1 kernel: [238056.262654] block drbd1: helper command: /sbin/drbdadm initial-split-brain minor-1 exit code 0 (0x0)
Jan 12 08:13:14 server1 kernel: [238056.262680] block drbd1: Split-Brain detected, 1 primaries, automatically solved. Sync from peer node
Jan 12 08:13:14 server1 kernel: [238056.262688] block drbd1: peer( Unknown -> Primary ) conn( WFReportParams -> WFBitMapT ) disk( UpToDate -> Outdated ) pdsk( DUnknown -> UpToDate )
Jan 12 08:13:14 server1 kernel: [238056.321303] block drbd1: receive bitmap stats [Bytes(packets)]: plain 0(0), RLE 31(1), total 31; compression: 100.0%
Jan 12 08:13:14 server1 kernel: [238056.374084] block drbd1: send bitmap stats [Bytes(packets)]: plain 0(0), RLE 31(1), total 31; compression: 100.0%
Jan 12 08:13:14 server1 kernel: [238056.374100] block drbd1: conn( WFBitMapT -> WFSyncUUID )
Jan 12 08:13:14 server1 kernel: [238056.376960] block drbd1: updated sync uuid B918764AA4AA3174:0000000000000000:A9C5E158DC3CB036:A9C4E158DC3CB036
Jan 12 08:13:14 server1 kernel: [238056.377094] block drbd1: helper command: /sbin/drbdadm before-resync-target minor-1
Jan 12 08:13:14 server1 kernel: [238056.379576] block drbd1: helper command: /sbin/drbdadm before-resync-target minor-1 exit code 0 (0x0)
Jan 12 08:13:14 server1 kernel: [238056.379603] block drbd1: conn( WFSyncUUID -> SyncTarget ) disk( Outdated -> Inconsistent )
Jan 12 08:13:14 server1 kernel: [238056.379616] block drbd1: Began resync as SyncTarget (will sync 12 KB [3 bits set]).
Jan 12 08:13:14 server1 kernel: [238056.438399] block drbd1: Resync done (total 1 sec; paused 0 sec; 12 K/sec)
Jan 12 08:13:14 server1 kernel: [238056.438410] block drbd1: updated UUIDs F8EB7C91BB3D8B5C:0000000000000000:B918764AA4AA3174:B917764AA4AA3174
Jan 12 08:13:14 server1 kernel: [238056.438418] block drbd1: conn( SyncTarget -> Connected ) disk( Inconsistent -> UpToDate )
Jan 12 08:13:14 server1 kernel: [238056.438506] block drbd1: helper command: /sbin/drbdadm after-resync-target minor-1
Jan 12 08:13:14 server1 kernel: [238056.440992] block drbd1: helper command: /sbin/drbdadm after-resync-target minor-1 exit code 0 (0x0)


root@server1:~# cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
    ns:0 nr:12 dw:160 dr:15337 al:3 bm:10 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0

и журнал server2 с введенными командами ( https://www.refheap.com/95998 ):

root@server1:~# ifdown eth4



Jan 12 08:03:47 server2 kernel: [234819.924834] ixgbe 0000:04:00.0 eth4: NIC Link is Down
Jan 12 08:03:55 server2 kernel: [234827.977628] drbd r0: peer( Primary -> Unknown ) conn( Connected -> NetworkFailure ) pdsk( UpToDate -> DUnknown )
Jan 12 08:03:55 server2 kernel: [234827.977750] drbd r0: asender terminated
Jan 12 08:03:55 server2 kernel: [234827.977754] drbd r0: Terminating drbd_a_r0
Jan 12 08:03:55 server2 kernel: [234827.977831] drbd r0: Connection closed
Jan 12 08:03:55 server2 kernel: [234827.977849] drbd r0: conn( NetworkFailure -> Unconnected )
Jan 12 08:03:55 server2 kernel: [234827.977852] drbd r0: receiver terminated
Jan 12 08:03:55 server2 kernel: [234827.977854] drbd r0: Restarting receiver thread
Jan 12 08:03:55 server2 kernel: [234827.977856] drbd r0: receiver (re)started
Jan 12 08:03:55 server2 kernel: [234827.977869] drbd r0: conn( Unconnected -> WFConnection )
Jan 12 08:03:56 server2 kernel: [234828.077287] block drbd1: role( Secondary -> Primary )
Jan 12 08:03:56 server2 kernel: [234828.077463] block drbd1: new current UUID F8EB7C91BB3D8B5D:B917764AA4AA3174:A9C5E158DC3CB036:A9C4E158DC3CB036
Jan 12 08:03:56 server2 kernel: [234828.359819] EXT4-fs (drbd1): recovery complete
Jan 12 08:03:56 server2 kernel: [234828.359942] EXT4-fs (drbd1): mounted filesystem with ordered data mode. Opts: (null)


root@server2:~#  cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:WFConnection ro:Primary/Unknown ds:UpToDate/DUnknown C r-----
    ns:0 nr:36 dw:4224 dr:6959 al:3 bm:5 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:12
root@server2:~# crm status
Last updated: Mon Jan 12 08:04:12 2015
Last change: Fri Jan  9 13:56:26 2015 via crmd on server1
Stack: classic openais (with plugin)
Current DC: server2 - partition WITHOUT quorum
Version: 1.1.10-42f2063
2 Nodes configured, 2 expected votes
3 Resources configured


Online: [ server2 ]
OFFLINE: [ server1 ]

 Master/Slave Set: masterdrbd [drbd]
     Masters: [ server2 ]
     Stopped: [ server1 ]
 Resource Group: complete_start
     drbd_mount (ocf::heartbeat:Filesystem):    Started server2

root@server1:~# ifup eth4

Jan 12 08:04:34 server2 kernel: [234866.881710] ixgbe 0000:04:00.0 eth4: NIC Link is Up 10 Gbps, Flow Control: RX/TX
Jan 12 08:04:43 server2 kernel: [234875.776456] EXT4-fs (drbd1): mounted filesystem with ordered data mode. Opts: (null)


root@server2:~#  cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:WFConnection ro:Primary/Unknown ds:UpToDate/DUnknown C r-----
    ns:0 nr:36 dw:4232 dr:8624 al:3 bm:5 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:12


root@server1:~# drbdadm connect all


Jan 12 08:05:25 server2 kernel: [234917.115735] drbd r0: Handshake successful: Agreed network protocol version 101
Jan 12 08:05:25 server2 kernel: [234917.115746] drbd r0: Agreed to support TRIM on protocol level
Jan 12 08:05:25 server2 kernel: [234917.115801] drbd r0: conn( WFConnection -> WFReportParams )
Jan 12 08:05:25 server2 kernel: [234917.115807] drbd r0: Starting asender thread (from drbd_r_r0 [2322])
Jan 12 08:05:25 server2 kernel: [234917.170741] block drbd1: drbd_sync_handshake:
Jan 12 08:05:25 server2 kernel: [234917.170749] block drbd1: self F8EB7C91BB3D8B5D:B917764AA4AA3174:A9C5E158DC3CB036:A9C4E158DC3CB036 bits:3 flags:0
Jan 12 08:05:25 server2 kernel: [234917.170753] block drbd1: peer 913B80AB1E998110:B917764AA4AA3175:A9C5E158DC3CB036:A9C4E158DC3CB036 bits:2 flags:0
Jan 12 08:05:25 server2 kernel: [234917.170756] block drbd1: uuid_compare()=100 by rule 90
Jan 12 08:05:25 server2 kernel: [234917.170763] block drbd1: helper command: /sbin/drbdadm initial-split-brain minor-1
Jan 12 08:05:25 server2 kernel: [234917.173313] block drbd1: helper command: /sbin/drbdadm initial-split-brain minor-1 exit code 0 (0x0)
Jan 12 08:05:25 server2 kernel: [234917.173334] block drbd1: Split-Brain detected, 1 primaries, automatically solved. Sync from this node
Jan 12 08:05:25 server2 kernel: [234917.173345] block drbd1: peer( Unknown -> Secondary ) conn( WFReportParams -> WFBitMapS ) pdsk( DUnknown -> Consistent )
Jan 12 08:05:25 server2 kernel: [234917.231440] block drbd1: send bitmap stats [Bytes(packets)]: plain 0(0), RLE 31(1), total 31; compression: 100.0%
Jan 12 08:05:25 server2 kernel: [234917.284423] block drbd1: receive bitmap stats [Bytes(packets)]: plain 0(0), RLE 31(1), total 31; compression: 100.0%
Jan 12 08:05:25 server2 kernel: [234917.284433] block drbd1: helper command: /sbin/drbdadm before-resync-source minor-1
Jan 12 08:05:25 server2 kernel: [234917.286897] block drbd1: helper command: /sbin/drbdadm before-resync-source minor-1 exit code 0 (0x0)
Jan 12 08:05:25 server2 kernel: [234917.286926] block drbd1: conn( WFBitMapS -> SyncSource ) pdsk( Consistent -> Inconsistent )
Jan 12 08:05:25 server2 kernel: [234917.286942] block drbd1: Began resync as SyncSource (will sync 12 KB [3 bits set]).
Jan 12 08:05:25 server2 kernel: [234917.287017] block drbd1: updated sync UUID F8EB7C91BB3D8B5D:B918764AA4AA3174:B917764AA4AA3174:A9C5E158DC3CB036
Jan 12 08:05:25 server2 kernel: [234917.348714] block drbd1: Resync done (total 1 sec; paused 0 sec; 12 K/sec)
Jan 12 08:05:25 server2 kernel: [234917.348722] block drbd1: updated UUIDs F8EB7C91BB3D8B5D:0000000000000000:B918764AA4AA3174:B917764AA4AA3174
Jan 12 08:05:25 server2 kernel: [234917.348732] block drbd1: conn( SyncSource -> Connected ) pdsk( Inconsistent -> UpToDate )


root@server2:~#  cat /proc/drbd
version: 8.4.3 (api:1/proto:86-101)
srcversion: 107E17F432EA25ED3AF8929

 1: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
    ns:12 nr:36 dw:4232 dr:8636 al:3 bm:7 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0

Конфигурация на серверах такая:

/etc/drbd.d/r0.res
resource r0 {
        on server1 {
                device /dev/drbd1;
                disk /dev/server1-vg/drbd1;
                address 192.168.1.1:7789;
                meta-disk internal;
        }
        on server2 {
                device /dev/drbd1;
                disk /dev/server2-vg/drbd1;
                address 192.168.1.2:7789;
                meta-disk internal;
        }
#        handlers {
#               split-brain "/usr/lib/drbd/notify-split-brain.sh root";
#       }
        net {
                max-buffers 8000;
                max-epoch-size 8000;
#               sndbuf-size 1024k;
                after-sb-0pri   discard-zero-changes;
                after-sb-1pri   discard-secondary;
                after-sb-2pri   disconnect;
        }
#       disk {
#               resync-rate 4194304;
#       }
#       startup {
#               become-primary-on server1;
#       }
}


/etc/drbd.d/global_common.conf
# DRBD is the result of over a decade of development by LINBIT.
# In case you need professional services for DRBD or have
# feature requests visit http://www.linbit.com

global {
        usage-count no;
        # minor-count dialog-refresh disable-ip-verification
}

common {
#       syncer {
#               rate 1150M;
#       }
        handlers {
                # These are EXAMPLE handlers only.
                # They may have severe implications,
                # like hard resetting the node under certain circumstances.
                # Be careful when chosing your poison.

                # pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                # pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                # local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
                # fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
                # split-brain "/usr/lib/drbd/notify-split-brain.sh root";
                # out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
                # before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
                # after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
        }

        startup {
                # wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
        }

        options {
                # cpu-mask on-no-data-accessible
        }

        disk {
                # size on-io-error fencing disk-barrier disk-flushes
                # disk-drain md-flushes resync-rate resync-after al-extents
                # c-plan-ahead c-delay-target c-fill-target c-max-rate
                # c-min-rate disk-timeout
        }

        net {
                # protocol timeout max-epoch-size max-buffers unplug-watermark
                # connect-int ping-int sndbuf-size rcvbuf-size ko-count
                # allow-two-primaries cram-hmac-alg shared-secret after-sb-0pri
                # after-sb-1pri after-sb-2pri always-asbp rr-conflict
                # ping-timeout data-integrity-alg tcp-cork on-congestion
                # congestion-fill congestion-extents csums-alg verify-alg
                # use-rle
        }
}



crm configure show
node server1
node server2
primitive drbd ocf:linbit:drbd \
        params drbd_resource="r0"
primitive drbd_mount ocf:heartbeat:Filesystem \
        params device="/dev/drbd1" directory="/drbd" fstype="ext4"
group complete_start drbd_mount
ms masterdrbd drbd \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
colocation mountDRBD inf: complete_start masterdrbd:Master
order mouten_danach inf: masterdrbd:promote complete_start:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.10-42f2063" \
        cluster-infrastructure="classic openais (with plugin)" \
        expected-quorum-votes="2" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        default-resource-stickiness="100" \
        last-lrm-refresh="1420808058"

Я что-то забыл в своей конфигурации? Я не хочу делать

drbdadm connect all

на серверах в случае сбоя. Именно для этого мне и нужна высокая доступность. Не могли бы вы мне помочь? Я не знаю, что не так в моей конфигурации.

Какой дистрибутив вы используете?

Убедитесь, что DRBD не настроен на автоматический запуск при загрузке:

  • CentOS / RHEL 6: chkconfig drbd off
  • CentOS / RHEL 7: systemctl disable drbd
  • Ubuntu / Debian: update-rc.d drbd disable

Если он не настроен на запуск при загрузке, возможно, ваша сеть отключается до полной остановки Pacemaker (я видел это на кластерах Ubuntu).

Проверьте журналы на наличие сообщений о том, что сеть выходит из строя, прежде чем Corosync / Pacemaker завершит остановку ресурсов.

Или просто убедитесь, что в сценарии инициализации Pacemaker указан правильный INIT INFO:

# grep -A 10 "BEGIN INIT INFO" /etc/init.d/pacemaker 
  ### BEGIN INIT INFO
  # Provides:             pacemaker
  # Required-Start:       $network $remote_fs corosync
  # Should-Start:         $syslog
  # Required-Stop:        $network $remote_fs corosync
  # Default-Start:
  # Default-Stop:
  # Short-Description:    Starts and stops Pacemaker Cluster Manager.
  # Description:          Starts and stops Pacemaker Cluster Manager.
  ### END INIT INFO

Надеюсь, это поможет найти проблему!