Lacking Natural Simplicity

Random musings on books, code, and tabletop games.

Splitting Strings on a Delimiter in the Ada Programming Language

Last edited: 2022-12-06 13:30:34 EST

When I did a search for “splitting strings on a delimiter in the Ada programming language” recently I did not get many useful results. Eventually I stumbled over GNAT.String_Split, which is an instantiation of the generic package GNAT.Array_Split. I also finally found GNATCOLL.Strings_Impl and its default instantiation, GNATCOLL.Strings, which looks especially interesting: it contains a split implementation and seems to be designed to be a more efficient string implementation than Ada.Strings.Unbounded.

However, those are all a little complicated, so it might be appropriate to show a simpler implementation.

The String type in Ada is an array of characters. Once declared, a String variable always has the same length. That means that all the strings in an array of strings have to be the same length. However, an access (Ada's version of a pointer) to a String can point to a string of any length, so for this version we'll return an array of pointers to String.

Operations on String are defined in Ada.Strings.Fixed.

with Ada.Strings; use Ada.Strings;
with Ada.Strings.Fixed; use Ada.Strings.Fixed;
with Ada.Text_IO;
with Ada.Integer_Text_IO; use Ada.Integer_Text_IO;
procedure split_fixed is
   -- Demonstration program: splits several sample fixed-length Strings on
   -- a delimiter and prints every resulting part.

   -- Ada.Text_IO contains a type, Count, that would conflict with
   -- the function Ada.Strings.Fixed.Count, so don't "use Ada.Text_IO;"
   -- instead, make a package that gives it a shorter name, and use all its
   -- procedures with that as the prefix.
   package ATIO renames Ada.Text_IO;

   -- The parts can have different lengths, so the result is an array of
   -- accesses to String rather than an array of String.
   type String_Ptr is access String;
   type Vector is array (Natural range <>) of String_Ptr;

   -- Allocate a new String, initialize it to Source, and return an
   -- access to it (a pointer).
   function "+" (Source : in String) return String_Ptr is
   begin
      return new String'(Source);
   end "+";

   -- Split S at every occurrence of Pattern and return the pieces, in
   -- order, in a Vector indexed from 1.  The result holds one more piece
   -- than there are matches, so leading, trailing, and adjacent
   -- delimiters produce empty pieces, and an empty S produces a single
   -- empty piece.
   --
   -- S may be a slice whose lower bound is not 1: the scan runs from
   -- S'First to S'Last instead of assuming the range 1 .. S'Length.
   --
   -- NOTE: Pattern is expected to be non-empty; per the Ada Reference
   -- Manual (A.4.3), Count propagates Pattern_Error for a null Pattern.
   function Split (S: String; Pattern: String) return Vector is
      -- Index in S where the next piece begins.  Declared as Integer so
      -- that initializing it from S'First cannot fail a range check when
      -- S is a null string with unusual bounds.
      Start     : Integer := S'First;
      Position  : Natural;
      Num_Parts : constant Natural := Count (S, Pattern) + 1;
      V : Vector (1 .. Num_Parts);
      I : Natural := 0;
   begin
      while Start <= S'Last loop
         Position := Index (S, Pattern, Start);
         exit when Position = 0;
         I := I + 1;
         V (I) := +S (Start .. Position - 1);
         -- The pattern can be longer than one character.
         Start := Position + Pattern'Length;
      end loop;

      -- Whatever follows the last delimiter is the final piece.  This is
      -- a null slice when S is empty or ends with the delimiter.
      I := I + 1;
      V (I) := +S (Start .. S'Last);

      return V;
   end Split;

   -- Print Label and the original string S on one line, followed by one
   -- indented line per element of V, numbered from 1.
   procedure Print_Vector (Label: String; S: String; V: Vector) is
      N: Natural := 0;
   begin
      ATIO.Put_Line (Label & ": """ & S & """");
      for I in V'Range loop
         N := N + 1;
         ATIO.Put ("    Part "); Put (N, 0); ATIO.Put (": """);
         ATIO.Put (V (I).all);
         ATIO.Put_Line ("""");
      end loop;
   end Print_Vector;

   S1: constant String := "Hello, World!|I am fine!|How are you?";
   V1: constant Vector := Split (S1, "|");
   S2: constant String := "";           --  Empty string.
   V2: constant Vector := Split (S2, "|");
   S3: constant String := "|";          --  Just one occurrence of the pattern.
   V3: constant Vector := Split (S3, "|");
   S4: constant String := "||";         --  Just two occurrences of the pattern.
   V4: constant Vector := Split (S4, "|");
   S5: constant String := "one";        --  Just one part.
   V5: constant Vector := Split (S5, "|");
   -- The delimiter doesn't have to be one character.
   S6: constant String := "foo<=>bar";
   V6: constant Vector := Split (S6, "<=>");

begin
   Print_Vector ("S1", S1, V1);
   Print_Vector ("S2", S2, V2);
   Print_Vector ("S3", S3, V3);
   Print_Vector ("S4", S4, V4);
   Print_Vector ("S5", S5, V5);
   Print_Vector ("S6", S6, V6);
end split_fixed;

Here's the output:

S1: "Hello, World!|I am fine!|How are you?"
    Part 1: "Hello, World!"
    Part 2: "I am fine!"
    Part 3: "How are you?"
S2: ""
    Part 1: ""
S3: "|"
    Part 1: ""
    Part 2: ""
S4: "||"
    Part 1: ""
    Part 2: ""
    Part 3: ""
S5: "one"
    Part 1: "one"
S6: "foo<=>bar"
    Part 1: "foo"
    Part 2: "bar"

The Bounded_String type in Ada has a maximum capacity and a current length. You instantiate a new package for each different maximum capacity that you want, producing a different type for each. You can assign any string whose length is less than or equal to the maximum capacity, and the current length is recorded.

Operations on Bounded_String are defined in Ada.Strings.Bounded.

with Ada.Strings; use Ada.Strings;
with Ada.Strings.Bounded; use Ada.Strings.Bounded;
with Ada.Text_IO.Bounded_IO;
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Integer_Text_IO; use Ada.Integer_Text_IO;
procedure split_bounded is
   -- Demonstration program: splits several sample Bounded_Strings on a
   -- delimiter and prints every resulting part.

   -- Bounded strings with a capacity of 128 characters, plus Text_IO
   -- operations for them.
   package B_String is new
     Ada.Strings.Bounded.Generic_Bounded_Length (Max => 128);
   use B_String;
   package B_String_IO is new Bounded_IO (B_String); use B_String_IO;

   type Vector is array (Natural range <>) of Bounded_String;

   -- Split S at every occurrence of Pattern and return the pieces, in
   -- order, in a Vector indexed from 1.  The result holds one more piece
   -- than there are matches, so leading, trailing, and adjacent
   -- delimiters produce empty pieces, and an empty S produces a single
   -- empty piece.
   function Split (S: Bounded_String; Pattern: String)
                  return Vector is
      Start     : Positive := 1;   -- Position in S where the next piece begins.
      Position  : Natural;
      -- Count is written with its package prefix because the directly
      -- visible names also include the type Ada.Text_IO.Count.
      Num_Parts : constant Natural := B_String.Count (S, Pattern) + 1;
      V : Vector (1 .. Num_Parts);
      I : Natural := 0;
   begin
      while Start <= Length (S) loop
         Position := Index (S, Pattern, Start);
         exit when Position = 0;
         I := I + 1;
         V (I) := Bounded_Slice (S, Start, Position - 1);
         -- The pattern can be longer than one character.
         Start := Position + Pattern'Length;
      end loop;

      -- Whatever follows the last delimiter is the final piece.  This is
      -- an empty slice when S is empty or ends with the delimiter.
      I := I + 1;
      V (I) := Bounded_Slice (S, Start, Length (S));

      return V;
   end Split;

   -- Print Label and the original string S on one line, followed by one
   -- indented line per element of V, numbered from 1.
   procedure Print_Vector (Label: String; S: Bounded_String; V: Vector) is
      N : Natural := 0;
   begin
      -- Build the header as an ordinary String.  Concatenating directly
      -- with S would produce a Bounded_String limited to 128 characters,
      -- which the added label and quotes could overflow.
      Put_Line (Label & ": """ & To_String (S) & """");
      for I in V'Range loop
         N := N + 1;
         Put ("    Part "); Put (N, 0); Put (": """); Put (V (I));
         Put_Line ("""");
      end loop;
   end Print_Vector;

   S1: constant Bounded_String :=
     To_Bounded_String ("Hello, World!|I am fine!|How are you?");
   V1: constant Vector := Split (S1, "|");
   --  Empty string.
   S2: constant Bounded_String := To_Bounded_String ("");
   V2: constant Vector := Split (S2, "|");
   --  Just one occurrence of the pattern.
   S3: constant Bounded_String := To_Bounded_String ("|");
   V3: constant Vector := Split (S3, "|");
   --  Just two occurrences of the pattern.
   S4: constant Bounded_String := To_Bounded_String ("||");
   V4: constant Vector := Split (S4, "|");
   --  Just one part.
   S5: constant Bounded_String := To_Bounded_String ("one");
   V5: constant Vector := Split (S5, "|");
   -- The delimiter doesn't have to be one character.
   S6: constant Bounded_String := To_Bounded_String ("foo<=>bar");
   V6: constant Vector := Split (S6, "<=>");

begin
   Print_Vector ("S1", S1, V1);
   Print_Vector ("S2", S2, V2);
   Print_Vector ("S3", S3, V3);
   Print_Vector ("S4", S4, V4);
   Print_Vector ("S5", S5, V5);
   Print_Vector ("S6", S6, V6);
end split_bounded;

Here's the output:

S1: "Hello, World!|I am fine!|How are you?"
    Part 1: "Hello, World!"
    Part 2: "I am fine!"
    Part 3: "How are you?"
S2: ""
    Part 1: ""
S3: "|"
    Part 1: ""
    Part 2: ""
S4: "||"
    Part 1: ""
    Part 2: ""
    Part 3: ""
S5: "one"
    Part 1: "one"
S6: "foo<=>bar"
    Part 1: "foo"
    Part 2: "bar"

The Unbounded_String type in Ada grows dynamically as needed, but is not as time efficient as fixed strings or bounded strings. For this version, we'll use Ada.Containers.Vectors for a dynamically expanding vector, rather than a fixed size vector.

Operations on Unbounded_String are defined in Ada.Strings.Unbounded.

with Ada.Strings.Unbounded; use Ada.Strings.Unbounded;
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Integer_Text_IO; use Ada.Integer_Text_Io;
with Ada.Text_IO.Unbounded_IO; use Ada.Text_IO.Unbounded_IO;
with Ada.Containers.Vectors;
procedure split_unbounded is
   -- Demonstration program: splits several sample Unbounded_Strings on a
   -- delimiter and prints every resulting part.

   -- A growable vector of Unbounded_String, indexed by Natural.
   package Unbounded_String_Vectors is new
     Ada.Containers.Vectors (Natural, Unbounded_String);
   use Unbounded_String_Vectors;

   -- Shorthand for converting a String to an Unbounded_String.
   function "+" (Source : in String)
                return Unbounded_String renames To_Unbounded_String;
   subtype UBS_Vector is Unbounded_String_Vectors.Vector;

   -- Split S at every occurrence of Pattern and return the pieces, in
   -- order.  One piece is appended per match found, plus a final piece
   -- for whatever follows the last match, so leading, trailing, and
   -- adjacent delimiters produce empty pieces, and an empty S produces a
   -- single empty piece.
   function Split (S: Unbounded_String; Pattern: String)
                  return UBS_Vector is
      Start    : Positive := 1;   -- Position in S where the next piece begins.
      Position : Natural;
      V : UBS_Vector;
   begin
      while Start <= Length (S) loop
         Position := Index (S, Pattern, Start);
         exit when Position = 0;
         Append (V, Unbounded_Slice (S, Start, Position - 1));
         -- The pattern can be longer than one character.
         Start := Position + Pattern'Length;
      end loop;

      -- Whatever follows the last delimiter is the final piece.  This is
      -- an empty slice when S is empty or ends with the delimiter.
      Append (V, Unbounded_Slice (S, Start, Length (S)));

      return V;
   end Split;

   -- Print Label and the original string S on one line, followed by one
   -- indented line per element of V, numbered from 1.
   procedure Print_UBS_Vector (Label: String;
                               S: Unbounded_String;
                               V: UBS_Vector) is
      N : Natural := 0;
   begin
      Put_Line (Label & ": """ & To_String (S) & """");
      for Part of V loop
         N := N + 1;
         Put ("    Part "); Put (N, 0); Put (": """); Put (Part);
         Put_Line ("""");
      end loop;
   end Print_UBS_Vector;

   S1: constant Unbounded_String := +"Hello, World!|I am fine!|How are you?";
   V1: constant UBS_Vector := Split (S1, "|");
   S2: constant Unbounded_String := +"";     --  Empty string.
   V2: constant UBS_Vector := Split (S2, "|");
   S3: constant Unbounded_String := +"|";    --  Just one occurrence of the pattern.
   V3: constant UBS_Vector := Split (S3, "|");
   S4: constant Unbounded_String := +"||";   --  Just two occurrences of the pattern.
   V4: constant UBS_Vector := Split (S4, "|");
   S5: constant Unbounded_String := +"one";  --  Just one part.
   V5: constant UBS_Vector := Split (S5, "|");
   -- The delimiter doesn't have to be one character.
   S6: constant Unbounded_String := +"foo<=>bar";
   V6: constant UBS_Vector := Split (S6, "<=>");

begin
   Print_UBS_Vector ("S1", S1, V1);
   Print_UBS_Vector ("S2", S2, V2);
   Print_UBS_Vector ("S3", S3, V3);
   Print_UBS_Vector ("S4", S4, V4);
   Print_UBS_Vector ("S5", S5, V5);
   Print_UBS_Vector ("S6", S6, V6);
end split_unbounded;

Here's the output:

S1: "Hello, World!|I am fine!|How are you?"
    Part 1: "Hello, World!"
    Part 2: "I am fine!"
    Part 3: "How are you?"
S2: ""
    Part 1: ""
S3: "|"
    Part 1: ""
    Part 2: ""
S4: "||"
    Part 1: ""
    Part 2: ""
    Part 3: ""
S5: "one"
    Part 1: "one"
S6: "foo<=>bar"
    Part 1: "foo"
    Part 2: "bar"
Print Friendly and PDF

Comments

Comments powered by Disqus