C# 过滤字符串中的汉字，效率高点的,该怎么处理

C# 过滤字符串中的汉字，效率高点的
例如: string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

有时候字符串可能会比较长，所以求效率比较好的。。
要求过滤之后：
string content = "13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n";
[最优解释]
if(radioButton1.Checked){ control = form2.zhucex;}
else if(radioButton2.Checked){ control =form3.zhuces;}
else if(radioButton3.Checked){ control = warring;}
else if(radioButton4.Checked){ control =suggest;}
else if(radioButton5.Checked){ control =form4.mumawe;}
else if(radioButton6.Checked){ control =drop;}
if (control =="000000")
{
MessageBox.Show("你没有输入任何控制目标!不发控制信号");
richTextBox1.AppendText("你没有输入任何控制目标!不发控制信号");
}
else if(control != "000000")
{
try
{
//记录操作
richTextBox1.AppendText (control + "正在试图控制,等待回应......" + "\r");
stream = client.GetStream();
if(stream.CanWrite )
{
byte[] by = System.Text.Encoding.ASCII.GetBytes(control.ToCharArray ());
stream.Write(by,0,by.Length);

stream.Flush();
threadReceive =new Thread(new ThreadStart(receive));
threadReceive.Start();
}//endif
}//try
catch
{
richTextBox1.AppendText("服务器未连接1控制无效!" +"\r");
MessageBox.Show("服务器未连接1控制无效!" +"\r");
}
}//else if
}

[其他解释]

// Strip Chinese characters (U+4E00..U+9FA5) from the text.
// First alternative: a line made up only of Chinese characters is removed together with
// its line break, so no empty line is left behind (the required output has none).
// Second alternative: a run of Chinese characters embedded in other text is removed alone.
// RegexOptions.Multiline makes '^' match at the start of every line, not just the string.
Regex reg = new Regex(@"^[\u4e00-\u9fa5]+\r?\n|[\u4e00-\u9fa5]+", RegexOptions.Multiline);
string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
string result = reg.Replace(content, "");
richTextBox2.Text = result;

如果对速度要求较高，可以通过foreach遍历，比较unicode来过滤
[其他解释]

引用:

C# code
Regex reg = new Regex(@"[\u4e00-\u9fa5]+");
string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\……

顶
[其他解释]

引用:

C# code
Regex reg = new Regex(@"[\u4e00-\u9fa5]+");
string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\……

Regex reg = new Regex(@"[\u4e00-\u9fa5]+");
使用正则是不错的选择
[其他解释]
过客下班了。真早。
[其他解释]

引用:

如果对速度要求较高，可以通过foreach遍历，比较unicode来过滤

foreach遍历比较unicode应该不会比正则快的
[其他解释]
使用正则
[其他解释]
foreach要以每个字符为单位遍历，不如正则快
------其他解决方案--------------------

贴出来吧。没测试速度。

/// <summary>
/// Filters the sample text with a plain character loop: every UTF-16 code unit outside
/// the range U+4E00..U+9FA5 is copied to a <see cref="StringBuilder"/>, and the filtered
/// text is written to the console.
/// </summary>
private static void TestChinese()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    // The result can never be longer than the input, so this capacity avoids regrowth.
    StringBuilder builder = new StringBuilder(content.Length);
    for (int i = 0; i < content.Length; i++)
    {
        // Compare the code unit itself. char.ConvertToUtf32(content, i) throws an
        // ArgumentException when i lands on the low half of a surrogate pair, which
        // happens as soon as the text contains any character outside the BMP.
        // Surrogates (U+D800..U+DFFF) lie above U+9FA5, so they are kept unchanged.
        char c = content[i];
        if (c < 0x4e00 || c > 0x9fa5)
        {
            builder.Append(c);
        }
    }
    Console.WriteLine(builder.ToString());
}

[其他解释]
学习~~
[其他解释]
先汉字判断
/**
 * Reports whether the first character of {@code word} is a Chinese character,
 * i.e. lies in the range U+4E00..U+9FA5.
 *
 * The check is done on the character value itself. Counting the bytes produced by
 * {@code getBytes()} depends on the platform default charset: a Chinese character is
 * two bytes in GBK but three bytes in UTF-8, so that test gives wrong answers on
 * UTF-8 systems.
 *
 * @param word the text to inspect; may be {@code null} or empty
 * @return 1 if the first character is a Chinese character, otherwise 0
 *         (including for {@code null} or empty input)
 */
private static int isCharacter(String word){
    // Nothing to inspect: report "not Chinese" instead of throwing.
    if (word == null || word.isEmpty()) {
        return 0;
    }
    char first = word.charAt(0);
    if (first >= 0x4e00 && first <= 0x9fa5) {
        return 1; // Chinese character
    } else {
        return 0; // not a Chinese character
    }
}

接着过滤汉字..
// C# identifiers are case-sensitive: the namespace, the Regex type and its Replace
// method must be written in PascalCase or the snippet does not compile.
// The pattern matches any single character in the range U+4E00..U+9FA5.
System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex("[\u4e00-\u9fa5]");
string replacedstring = regex.Replace(str, ""); // every character in that range is removed from str

[其他解释]
可能我写的不好，20次对比，还是差不多的。正则还快一点，可能要考虑反复构建正则引擎时候，上一次没有释放带来的优化，不知道有没有。


/// <summary>
/// Benchmark driver: runs 20 rounds, each timing 10000 calls of the regex filter and
/// 10000 calls of the character-loop filter, and prints both elapsed times in
/// milliseconds as "regex,loop".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    for (int x = 0; x < 20; x++)
    {
        // Stopwatch is used instead of Environment.TickCount, which only advances in
        // steps of the system timer (roughly 10-16 ms) and so blurs small differences.
        System.Diagnostics.Stopwatch regexTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseRegex01();
        regexTimer.Stop();

        // The second measurement must time the character-loop filter; timing
        // TestChineseRegex01 in both loops would only compare the regex with itself.
        System.Diagnostics.Stopwatch loopTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChinese();
        loopTimer.Stop();

        Console.WriteLine(regexTimer.ElapsedMilliseconds.ToString() + "," + loopTimer.ElapsedMilliseconds.ToString());
    }
    Console.ReadKey();
}

// Built once and reused. Constructing a Regex parses the pattern, so creating it inside
// a method that is called thousands of times adds setup cost to every call.
private static readonly Regex s_chineseRun = new Regex(@"[\u4e00-\u9fa5]+");

/// <summary>
/// Filters the sample text by replacing every run of characters in the range
/// U+4E00..U+9FA5 with the empty string. The result is discarded; the method exists
/// only to be timed.
/// </summary>
private static void TestChineseRegex01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = s_chineseRun.Replace(content, "");
    //Console.WriteLine(result);
}

/// <summary>
/// Filters the sample text with a plain character loop: every UTF-16 code unit outside
/// the range U+4E00..U+9FA5 is copied to a <see cref="StringBuilder"/>. The result is
/// discarded; the method exists only to be timed.
/// </summary>
private static void TestChinese()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    // The result can never be longer than the input, so this capacity avoids regrowth.
    StringBuilder builder = new StringBuilder(content.Length);
    for (int i = 0; i < content.Length; i++)
    {
        // Compare the code unit itself. char.ConvertToUtf32(content, i) throws an
        // ArgumentException when i lands on the low half of a surrogate pair, and it
        // adds a call per character without changing the outcome for BMP text.
        char c = content[i];
        if (c < 0x4e00 || c > 0x9fa5)
        {
            builder.Append(c);
        }
    }
    //Console.WriteLine(builder.ToString());
}

几次测试结果：
171,172
156,187
172,156
171,172
156,172
171,172
156,171
172,156
172,156
187,203
171,187
141,171
156,172
172,156
187,171
156,172
156,172
156,171
156,172
171,156
-------
165.3,170.85 <- 平均值
正则 165.3 ms
foreach 170.85 ms
[其他解释]
要说效率。嘿嘿。写了个更快的。
直接上对比代码和结果

/// <summary>
/// Benchmark driver: runs 20 rounds, each timing 10000 calls of the regex filter, the
/// character-loop filter and the LINQ filter, and prints the three elapsed times in
/// milliseconds separated by tabs.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    for (int x = 0; x < 20; x++)
    {
        // Stopwatch is used instead of Environment.TickCount, which only advances in
        // steps of the system timer (roughly 10-16 ms) and so blurs small differences.
        System.Diagnostics.Stopwatch regexTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseRegex01();
        regexTimer.Stop();

        // The middle column must time the character-loop filter; calling
        // TestChineseRegex01 here as well would only measure the regex a second time.
        System.Diagnostics.Stopwatch loopTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChinese();
        loopTimer.Stop();

        System.Diagnostics.Stopwatch linqTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseLinq01();
        linqTimer.Stop();

        Console.WriteLine(regexTimer.ElapsedMilliseconds.ToString() + "\t" + loopTimer.ElapsedMilliseconds.ToString() + "\t" + linqTimer.ElapsedMilliseconds.ToString());
    }
    Console.ReadKey();
}

/// <summary>
/// Filters the sample text with LINQ: keeps every character outside the range
/// U+4E00..U+9FA5 and builds a new string from the survivors. The result is discarded;
/// the method exists only to be timed.
/// </summary>
private static void TestChineseLinq01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    // A char compares directly against the integer bounds; the two halves of the test
    // are joined with a logical OR (keep anything below the range or above it).
    string result = new string(content.Where(c => c < 0x4e00 || c > 0x9fa5).ToArray());
    //Console.WriteLine(result);
}

// Built once and reused. Constructing a Regex parses the pattern, so creating it inside
// a method that is called thousands of times adds setup cost to every call.
private static readonly Regex s_chineseRun = new Regex(@"[\u4e00-\u9fa5]+");

/// <summary>
/// Filters the sample text by replacing every run of characters in the range
/// U+4E00..U+9FA5 with the empty string. The result is discarded; the method exists
/// only to be timed.
/// </summary>
private static void TestChineseRegex01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = s_chineseRun.Replace(content, "");
    //Console.WriteLine(result);
}

对比结果


regex foreach  linq
203     171     94
156     172     78
171     172     78
171     156     78
172     172     78
171     156     94
156     171     78
156     156     94
156     172     78
156     171     78
172     156     78
171     188     93
156     156     94
156     187     140
156     172     94
156     171     78
172     171     78
156     172     78
172     156     78
171     156     78

[其他解释]
我也贴一个,用数组可能会快点,100万次1.6秒 ,T7500的cpu


        void kickoffChinese()
        {
            string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n"; 
 

            char[] chars = content.ToCharArray();
            int offset = 0;
            char[] result = new char[chars.Length];
            for (int i = 0; i &lt; chars.Length; i++)
            {
                if (chars[i] &lt; 0x4E00 
[其他解释]
论坛眼神最差的算我了，仔细看才发现，居然2次对比的都是regex的。。。。
重贴测试代码

/// <summary>
/// Benchmark driver: prints a column header, then runs 20 rounds, each timing 10000
/// calls of the regex, character-loop and LINQ filters, and prints the three elapsed
/// times in milliseconds.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("regex" + "\t\t" + "foreach" + "\t\t" + "linq");
    for (int x = 0; x < 20; x++)
    {
        // Stopwatch is used instead of Environment.TickCount, which only advances in
        // steps of the system timer (roughly 10-16 ms) and so blurs small differences.
        System.Diagnostics.Stopwatch regexTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseRegex01();
        regexTimer.Stop();

        System.Diagnostics.Stopwatch loopTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseForeach01();
        loopTimer.Stop();

        System.Diagnostics.Stopwatch linqTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseLinq01();
        linqTimer.Stop();

        Console.WriteLine(regexTimer.ElapsedMilliseconds.ToString() + "\t\t" + loopTimer.ElapsedMilliseconds.ToString() + "\t\t" + linqTimer.ElapsedMilliseconds.ToString());
    }
    Console.ReadKey();
}

private static void TestChineseLinq01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = new string(content.Where(c =&gt; ((uint)c &lt; 0x4e00 
[其他解释]
 chars[i] &gt; 0x9FA5)
                {
                    result[offset] = chars[i];
                    offset++;
                }
            }
            char[] dest=new char[offset];
            Array.Copy(result, dest, offset); 
 
            //Console.WriteLine(new string(dest));
        }


[其他解释]
 n &gt; 0x9fa5)
        {
            builder.Append(content[i]);
        }
    }
    //Console.WriteLine(builder.ToString());
}

测试结果

regex foreach linq
203 47 94
171 31 94
156 47 78
171 47 78
156 47 62
172 31 78
156 47 78
156 47 78
156 47 78
156 31 78
218 47 78
172 31 78
171 47 78
156 47 78
172 46 78
156 47 78
172 31 78
156 47 78
171 32 78

171 31 78

如客客师傅说的，foreach 最快。
[其他解释]
(uint)c > 0x9fa5)).ToArray());
//Console.WriteLine(result);
}

// Built once and reused. Constructing a Regex parses the pattern, so creating it inside
// a method that is called thousands of times adds setup cost to every call.
private static readonly Regex s_chineseRun = new Regex(@"[\u4e00-\u9fa5]+");

/// <summary>
/// Filters the sample text by replacing every run of characters in the range
/// U+4E00..U+9FA5 with the empty string. The result is discarded; the method exists
/// only to be timed.
/// </summary>
private static void TestChineseRegex01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = s_chineseRun.Replace(content, "");
    //Console.WriteLine(result);
}

private static void TestChineseForeach01()
{
string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
StringBuilder builder = new StringBuilder(content.Length);
for (int i = 0; i < content.Length; i++)
{
int n = char.ConvertToUtf32(content, i);
if (n < 0x4e00
[其他解释]
本帖最后由 lxcnn 于 2010-05-27 19:04:24 编辑

无牙，效率不是这样比较的哈，因为正则不能这样用

如果你用循环对比的话，需要在循环体外声明正则，这样对正则才算公平

之所以认为foreach会比正则快，是因为用foreach时的优化空间比正则大一些
当然，同时还需要关注一下内存占用情况

不过话说回来，如果不是百万级的数据，一般是感觉不到处理效率差异的
[其他解释]
这么多高手过招，受益匪浅啊。
[其他解释]
按14楼的方法做了个比较


REGEX           linq            kickoffChinese
156             32              15
157             47              15
172             31              16
156             47              16
140             47              16
156             31              16
156             47              16
156             47              15
141             47              16
140             47              16
156             31              16
156             47              16
140             47              16
140             47              16
140             47              16
156             47              16
140             47              16
140             47              16
141             46              16
156             32              15

[其他解释]
(uint)c > 0x9fa5)).ToArray());
//Console.WriteLine(result);
}

/// <summary>
/// Regex variant of the filter: applies the shared <c>reg</c> pattern to the shared
/// <c>content</c> text and replaces every match with the empty string. The filtered
/// text is not used; the method exists only to be timed.
/// </summary>
private static void TestChineseRegex01()
{
    // Both the pattern and the input live in static fields, so nothing is rebuilt here.
    string filtered = reg.Replace(input: content, replacement: string.Empty);
    // Uncomment to inspect the output:
    //Console.WriteLine(filtered);
}

private static void TestChineseForeach01()
{
builder.Clear();
for (int i = 0; i < content.Length; i++)
{
if (content[i] < 0x4e00

[其他解释]
好吧，按客客说的，我把 regex 的声明放到了外面，似乎改变也不大。

/// <summary>
/// Benchmark driver: prints a column header, then runs 20 rounds, each timing 10000
/// calls of the regex, character-loop, LINQ and char-array filters, and prints the four
/// elapsed times in milliseconds.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("regex\t\tforeach\t\tlinq\t\tkickoffChinese");
    for (int x = 0; x < 20; x++)
    {
        // Stopwatch is used instead of Environment.TickCount, which only advances in
        // steps of the system timer (roughly 10-16 ms) and so blurs small differences.
        System.Diagnostics.Stopwatch regexTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseRegex01();
        regexTimer.Stop();

        System.Diagnostics.Stopwatch loopTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseForeach01();
        loopTimer.Stop();

        System.Diagnostics.Stopwatch linqTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) TestChineseLinq01();
        linqTimer.Stop();

        System.Diagnostics.Stopwatch arrayTimer = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < 10000; i++) kickoffChinese();
        arrayTimer.Stop();

        Console.WriteLine(regexTimer.ElapsedMilliseconds.ToString() + "\t\t" + loopTimer.ElapsedMilliseconds.ToString() + "\t\t" + linqTimer.ElapsedMilliseconds.ToString() + "\t\t" + arrayTimer.ElapsedMilliseconds.ToString());
    }
    Console.ReadKey();
}

// Shared state for the benchmark methods, created once so that per-call setup cost is
// not part of what gets timed.

// Matches one or more consecutive characters in the range U+4E00..U+9FA5; built with
// RegexOptions.Compiled.
static Regex reg = new Regex(@"[\u4e00-\u9fa5]+", RegexOptions.Compiled);
// Sample input: phone-number lines mixed with Chinese text, separated by \r\n.
static string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
// Reusable output buffer sized to the input length. Its initializer reads content, and
// static field initializers run in textual order, so it must stay declared after content.
static StringBuilder builder = new StringBuilder(content.Length);
        
private static void TestChineseLinq01()
{
    //string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = new string(content.Where(c =&gt; ((uint)c &lt; 0x4e00 
[其他解释]
 chars[i] &gt; 0x9FA5)
        {
            result[offset] = chars[i];
            offset++;
        }
    }
    char[] dest = new char[offset];
    Array.Copy(result, dest, offset);
    //Console.WriteLine(new string(dest));
}

结果

regex foreach linq kickoffChinese

140 60 261 40
130 60 170 50
181 70 190 50
120 60 170 41
130 60 170 40
130 60 171 50
120 60 170 40
130 61 170 40
130 60 160 50
121 70 170 50
120 70 241 60
170 70 180 50
151 60 170 50
120 60 171 40
130 60 160 50
130 60 161 50

120 60 170 40
120 61 170 40
130 60 160 50
121 60 170 40

skep99确实基础扎实，多次都是能给出效率很高的解法。
[其他解释]
content[i] > 0x9fa5)
{
builder.Append(content[i]);
}
}
//Console.WriteLine(builder.ToString());
}

static void kickoffChinese()
{
//string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

char[] chars = content.ToCharArray();
int offset = 0;
char[] result = new char[chars.Length];
for (int i = 0; i < chars.Length; i++)
{
if (chars[i] < 0x4E00

C# 过滤字符串中的汉字效率高点的,该

热点推荐