This repository has been archived on 2024-11-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
XML-Cleaner-CS/xmlparser/Program.cs
Bertalan Krisztián 53ad483e35 initial commit
2024-08-13 14:30:32 +02:00

67 lines
1.7 KiB
C#

using System;
using System.Xml.Linq;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using static System.Runtime.InteropServices.JavaScript.JSType;
namespace xmlparser
{
    /// <summary>
    /// Console tool that gathers the text of the first <c>tartalom</c> element of every
    /// <c>*.xml</c> file in the <c>xml_files</c> directory, strips tag-like substrings and
    /// repeated spaces, and writes the combined result to <c>output.txt</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: reads the XML files, extracts and cleans their text, writes the output file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            XNamespace ns = "http://ijr.hu/schema/jogszabaly.xsd";
            string[] files = Directory.GetFiles(@"xml_files", "*.xml");

            // Directory.GetFiles makes no ordering guarantee; sort so the output is reproducible.
            Array.Sort(files, StringComparer.Ordinal);

            // Accumulate in a builder instead of re-copying a growing string for every file.
            var combined = new StringBuilder();
            foreach (var filepath in files)
            {
                XDocument doc = XDocument.Load(filepath);
                var content = doc.Descendants(XName.Get("tartalom", ns.ToString())).FirstOrDefault();
                if (content is null)
                {
                    // A file without the expected element used to crash the whole run.
                    Console.Error.WriteLine($"Skipping '{filepath}': no 'tartalom' element found.");
                    continue;
                }

                combined.Append(ExtractText(content));
            }

            string resultText = RemoveTagsFromXmlLikeString(combined.ToString());
            resultText = RemoveExtraSpaces(resultText);
            File.WriteAllText("output.txt", resultText);
        }

        /// <summary>
        /// Concatenates the values of all text and CDATA nodes found in <paramref name="element"/>
        /// (the element itself and all of its descendants), in document order.
        /// </summary>
        /// <param name="element">Root element to read; may be <c>null</c>.</param>
        /// <returns>
        /// The collected text. One space is appended after every visited node (text or not),
        /// so the result usually contains runs of spaces. Returns an empty string when
        /// <paramref name="element"/> is <c>null</c>.
        /// </returns>
        static string ExtractText(XElement element)
        {
            if (element is null)
            {
                return string.Empty;
            }

            var textBuilder = new StringBuilder();
            foreach (var node in element.DescendantNodesAndSelf())
            {
                // XCData derives from XText, so a single type test covers both node kinds.
                if (node is XText textNode)
                {
                    textBuilder.Append(textNode.Value);
                }

                textBuilder.Append(' ');
            }

            return textBuilder.ToString();
        }

        /// <summary>
        /// Removes every tag-like substring: a <c>&lt;</c>, one or more characters other than
        /// <c>&gt;</c>, and a closing <c>&gt;</c>. An empty pair (<c>&lt;&gt;</c>) is left alone.
        /// </summary>
        /// <remarks>
        /// NOTE(review): presumably needed because text nodes can carry escaped markup that
        /// shows up as literal tags in their values; confirm against the input files.
        /// </remarks>
        /// <param name="input">Text to clean.</param>
        /// <returns><paramref name="input"/> with all tag-like substrings removed.</returns>
        static string RemoveTagsFromXmlLikeString(string input)
        {
            const string tagPattern = @"<[^>]+?>";
            return Regex.Replace(input, tagPattern, "");
        }

        /// <summary>
        /// Collapses each run of two or more space characters into a single space and trims
        /// leading and trailing whitespace. Tabs and line breaks inside the text are not collapsed.
        /// </summary>
        /// <param name="sender">Text to normalize.</param>
        /// <returns>The normalized text.</returns>
        public static string RemoveExtraSpaces(string sender)
        {
            const string repeatedSpacesPattern = "[ ]{2,}";
            return Regex.Replace(sender, repeatedSpacesPattern, " ").Trim();
        }
    }
}