Here is a working solution:
1. Trim unwanted leading characters
2. Remove unwanted words
3. Remove the unwanted characters that trail the removed words
using System.Text;
// Reads data.txt and prints its text with the unwanted words filtered out:
//   1. everything before the first letter or digit is dropped;
//   2. a word runs from a letter/digit up to the next separator, and is dropped when it
//      equals one of wordsToRemove (case-insensitive, invariant culture);
//   3. the non-word characters following a dropped word are skipped up to and including
//      the next '!', or until the next word starts.
string file = "data.txt";
string[] wordsToRemove = { "Home", "House", "Room" };
char[] separators = { ';', ' ', '.', ',', '!', '?', ':' };
string rawText = File.ReadAllText(file);

bool isCapturing = false; // set once the first letter/digit has been seen
bool isTrimming = false;  // true while skipping the characters that trail a removed word
int start = -1;           // index where the current word began; -1 while between words
StringBuilder sb = new();

// True when the word matches one of the unwanted words, ignoring case.
bool IsUnwanted(string word) =>
    wordsToRemove.Any(x => x.Equals(word, StringComparison.InvariantCultureIgnoreCase));

for (int i = 0; i < rawText.Length; i++)
{
    char current = rawText[i];

    // A letter or digit seen between words opens a new word and ends any trimming.
    if (start == -1 && char.IsLetterOrDigit(current))
    {
        isCapturing = true;
        isTrimming = false;
        start = i;
    }

    // Between words: copy the character through unless we are trimming after a removed word.
    if (start == -1 && isCapturing)
    {
        if (isTrimming && current == '!')
        {
            // This '!' closes the trailing run of the removed word; it is skipped as well.
            isTrimming = false;
            continue;
        }

        if (!isTrimming)
        {
            sb.Append(current);
        }
    }

    // A separator closes the current word: keep it together with the separator,
    // or drop both and start trimming.
    if (start > -1 && separators.Contains(current))
    {
        string word = rawText.Substring(start, i - start);
        if (IsUnwanted(word))
        {
            isTrimming = true;
        }
        else
        {
            sb.Append(word).Append(current);
        }

        start = -1;
    }
}

// The input may end in the middle of a word (no closing separator). Flush that word
// too, otherwise it would be silently lost.
if (start > -1)
{
    string lastWord = rawText.Substring(start);
    if (!IsUnwanted(lastWord))
    {
        sb.Append(lastWord);
    }
}

Console.WriteLine(sb);
Output:
fhgkHouse!Dog;;;!!Inside!C!Table!London!Computer!
UPDATE
@PIEBALDconsult, here is a regex version just for you...
// Regex version: reads data.txt and prints the text after removing
//   - everything before the first letter, and
//   - every unwanted word that is not preceded by a letter, together with the text
//     that follows it up to and including the next '!' (or up to the end of the input).
string file = "data.txt";
string[] wordsToRemove = { "Home", "House", "Room" };
string rawText = File.ReadAllText(file);

// Escape each word so that regex metacharacters in a word are matched literally
// instead of changing the meaning of the pattern.
string alternation = string.Join(
    "|",
    Array.ConvertAll(wordsToRemove, System.Text.RegularExpressions.Regex.Escape));

string pattern = $@"^.*?(?=[a-z])|(?<![a-z])((?=(?:{alternation}))(.*?)(?:\!|\z))";

// CultureInvariant keeps the case-insensitive matching independent of the
// culture the program happens to run under.
string result = System.Text.RegularExpressions.Regex.Replace(
    rawText,
    pattern,
    "",
    System.Text.RegularExpressions.RegexOptions.IgnoreCase
        | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

Console.WriteLine(result);
Output:
fhgkHouse!Dog;;;!!Inside!C!Table!London!Computer!
For an explanation of how it works, paste the regular expression and a test string into
regex101 (https://regex101.com), an online tool to build, test, and debug regular expressions.
Enjoy!