Initial code to actually remove slots when parting a node.

author Andres Freund <andres@anarazel.de>

Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)

committer Andres Freund <andres@anarazel.de>

Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
author Andres Freund <andres@anarazel.de>
Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
committer Andres Freund <andres@anarazel.de>
Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
diff --git a/bdr_perdb.c b/bdr_perdb.c

index 736e154760fafbdaf8fa18679ec67001d794251c..438a39ee642f699772e8c0b0afe66abd4a218be3 100644 (file)
--- a/bdr_perdb.c
+++ b/bdr_perdb.c
@@ -384,8 +384,62 @@ bdr_maintain_db_workers(void)
         }
         else
         {
-           /* Drop slots of dead node */
-           elog(LOG, "need to drop slots for remote (a,b,c)");
+           List *drop = NIL;
+           ListCell *dc;
+           bool we_were_dropped;
+           NameData slot_name_dropped; /* slot of the dropped node */
+
+           /* if a remote node parted (or was parted), we can simply drop its slot */
+           bdr_slot_name(&slot_name_dropped,
+                         node_sysid, node_timeline, node_datoid,
+                         MyDatabaseId);
+
+           we_were_dropped = node_sysid == GetSystemIdentifier() &&
+               node_timeline == ThisTimeLineID &&
+               node_datoid == MyDatabaseId;
+
+           LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+           for (i = 0; i < max_replication_slots; i++)
+           {
+               ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+
+               if (!s->in_use)
+                   continue;
+
+               if (strcmp("bdr", NameStr(s->data.plugin)) != 0)
+                   continue;
+
+               if (we_were_dropped &&
+                   s->data.database == MyDatabaseId)
+               {
+                   elog(LOG, "need to drop slot %s as we got parted",
+                        NameStr(s->data.name));
+                   drop = lappend(drop, pstrdup(NameStr(s->data.name)));
+               }
+
+               if (strcmp(NameStr(s->data.name),
+                          NameStr(slot_name_dropped)) == 0)
+               {
+                   elog(LOG, "need to drop slot %s of dropped node",
+                        NameStr(s->data.name));
+                   drop = lappend(drop, pstrdup(NameStr(s->data.name)));
+               }
+           }
+           LWLockRelease(ReplicationSlotControlLock);
+
+           foreach(dc, drop)
+           {
+               char *slot_name = (char *) lfirst(dc);
+               elog(LOG, "dropping slot %s due to node part", slot_name);
+               ReplicationSlotDrop(slot_name);
+               elog(LOG, "dropped slot %s due to node part", slot_name);
+           }
+
+           /*
+            * TODO: It'd be a good idea to set the slot to dead (in contrast
+            * to being killed) here. That way we wouldn't constantly rescan
+            * killed nodes.
+            */
         }
  
         LWLockRelease(BdrWorkerCtl->lock);
diff --git a/expected/part_bdr.out b/expected/part_bdr.out

index ac0351d1251f6af977b76f2c02c5918ebb8705a2..18b484095b64027a725f2f7262abb0eda3dd3800 100644 (file)
--- a/expected/part_bdr.out
+++ b/expected/part_bdr.out
@@ -1,24 +1,22 @@
-\c postgres
+\c regression
  SELECT bdr.bdr_part_by_node_names(ARRAY['node-pg']);
   bdr_part_by_node_names 
  ------------------------
   
  (1 row)
  
-SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
- pg_xlog_wait_remote_apply 
----------------------------
+-- wait till all slots are killed, we need a better way for that.
+SELECT pg_sleep(1);
+ pg_sleep 
+----------
   
  (1 row)
  
-\c regression
  -- There should now be zero slots
-SELECT plugin, slot_type, database, active FROM pg_replication_slots;
- plugin | slot_type |  database  | active 
---------+-----------+------------+--------
- bdr    | logical   | postgres   | f
- bdr    | logical   | regression | f
-(2 rows)
+SELECT * FROM pg_replication_slots;
+ slot_name | plugin | slot_type | datoid | database | active | xmin | catalog_xmin | restart_lsn 
+-----------+--------+-----------+--------+----------+--------+------+--------------+-------------
+(0 rows)
  
  -- Zero active connections
  SELECT count(*) FROM pg_stat_replication;
diff --git a/sql/part_bdr.sql b/sql/part_bdr.sql

index b2a9711797edae19446df17bb628d980657dbcc1..234202924bd0dfeb077edc3e030b5656e8b3ece0 100644 (file)
--- a/sql/part_bdr.sql
+++ b/sql/part_bdr.sql
@@ -1,12 +1,11 @@
-\c postgres
+\c regression
  SELECT bdr.bdr_part_by_node_names(ARRAY['node-pg']);
  
-SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), 0);
-
-\c regression
+-- wait till all slots are killed, we need a better way for that.
+SELECT pg_sleep(1);
  
  -- There should now be zero slots
-SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT * FROM pg_replication_slots;
  -- Zero active connections
  SELECT count(*) FROM pg_stat_replication;
  -- and the node state for the removed node should show 'k'
author	Andres Freund <andres@anarazel.de>
	Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
committer	Andres Freund <andres@anarazel.de>
	Fri, 15 May 2015 14:03:01 +0000 (16:03 +0200)
bdr_perdb.c		patch \| blob \| blame \| history
expected/part_bdr.out		patch \| blob \| blame \| history
sql/part_bdr.sql		patch \| blob \| blame \| history