Initial code to actually remove slots when parting a node.
author: Andres Freund <andres@anarazel.de>
Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
committer: Andres Freund <andres@anarazel.de>
Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
This removes a slot when
a) It belongs to a remote node that has been parted.
b) We ourselves have been parted and BDR slots still exist in our
   database. It is not guaranteed that we're notified of that case, but
   it seems worthwhile to handle nonetheless.

bdr_perdb.c
expected/part_bdr.out
sql/part_bdr.sql

index 736e154760fafbdaf8fa18679ec67001d794251c..438a39ee642f699772e8c0b0afe66abd4a218be3 100644 (file)
@@ -384,8 +384,62 @@ bdr_maintain_db_workers(void)
        }
        else
        {
-           /* Drop slots of dead node */
-           elog(LOG, "need to drop slots for remote (a,b,c)");
+           List *drop = NIL;
+           ListCell *dc;
+           bool we_were_dropped;
+           NameData slot_name_dropped; /* slot of the dropped node */
+
+           /* if a remote node (got) parted, we can easily drop their slot */
+           bdr_slot_name(&slot_name_dropped,
+                         node_sysid, node_timeline, node_datoid,
+                         MyDatabaseId);
+
+           we_were_dropped = node_sysid == GetSystemIdentifier() &&
+               node_timeline == ThisTimeLineID &&
+               node_datoid == MyDatabaseId;
+
+           LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+           for (i = 0; i < max_replication_slots; i++)
+           {
+               ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+
+               if (!s->in_use)
+                   continue;
+
+               if (strcmp("bdr", NameStr(s->data.plugin)) != 0)
+                   continue;
+
+               if (we_were_dropped &&
+                   s->data.database == MyDatabaseId)
+               {
+                   elog(LOG, "need to drop slot %s as we got parted",
+                        NameStr(s->data.name));
+                   drop = lappend(drop, pstrdup(NameStr(s->data.name)));
+               }
+
+               if (strcmp(NameStr(s->data.name),
+                          NameStr(slot_name_dropped)) == 0)
+               {
+                   elog(LOG, "need to drop slot %s of dropped node",
+                        NameStr(s->data.name));
+                   drop = lappend(drop, pstrdup(NameStr(s->data.name)));
+               }
+           }
+           LWLockRelease(ReplicationSlotControlLock);
+
+           foreach(dc, drop)
+           {
+               char *slot_name = (char *) lfirst(dc);
+               elog(LOG, "dropping slot %s due to node part", slot_name);
+               ReplicationSlotDrop(slot_name);
+               elog(LOG, "dropped slot %s due to node part", slot_name);
+           }
+
+           /*
+            * TODO: It'd be a good idea to set the node to dead (in contrast
+            * to being killed) here. That way we wouldn't constantly rescan
+            * killed nodes.
+            */
        }
 
        LWLockRelease(BdrWorkerCtl->lock);
index ac0351d1251f6af977b76f2c02c5918ebb8705a2..18b484095b64027a725f2f7262abb0eda3dd3800 100644 (file)
@@ -1,24 +1,22 @@
-\c postgres
+\c regression
 SELECT bdr.bdr_part_by_node_names(ARRAY['node-pg']);
  bdr_part_by_node_names 
 ------------------------
  
 (1 row)
 
-SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
- pg_xlog_wait_remote_apply 
----------------------------
+-- wait till all slots are killed, we need a better way for that.
+SELECT pg_sleep(1);
+ pg_sleep 
+----------
  
 (1 row)
 
-\c regression
 -- There should now be zero slots
-SELECT plugin, slot_type, database, active FROM pg_replication_slots;
- plugin | slot_type |  database  | active 
---------+-----------+------------+--------
- bdr    | logical   | postgres   | f
- bdr    | logical   | regression | f
-(2 rows)
+SELECT * FROM pg_replication_slots;
+ slot_name | plugin | slot_type | datoid | database | active | xmin | catalog_xmin | restart_lsn 
+-----------+--------+-----------+--------+----------+--------+------+--------------+-------------
+(0 rows)
 
 -- Zero active connections
 SELECT count(*) FROM pg_stat_replication;
index b2a9711797edae19446df17bb628d980657dbcc1..234202924bd0dfeb077edc3e030b5656e8b3ece0 100644 (file)
@@ -1,12 +1,11 @@
-\c postgres
+\c regression
 SELECT bdr.bdr_part_by_node_names(ARRAY['node-pg']);
 
-SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
-
-\c regression
+-- wait till all slots are killed, we need a better way for that.
+SELECT pg_sleep(1);
 
 -- There should now be zero slots
-SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT * FROM pg_replication_slots;
 -- Zero active connections
 SELECT count(*) FROM pg_stat_replication;
 -- and the node state for the removed node should show 'k'