如何用正则表达式取出表格中的数据? [复制链接]查看:2245回复:0

1#
如何用正则表达式取出表格中的数据?

<td style="width: 2%" title='Diamond On Memo' class='mt'> M </td>

这个是“mt”。。。

C# code

    Regex regTR = new Regex(@"(?is)<tr>\s*(?:<td(?:(?!class=).)*class=([""']?)(?:td\d*|mtt?\d*)\1>(?:(?!</td>).)*</td>\s*){5}(?<content>(?:<td(?:(?!class=).)*class=([""']?)(?:td\d*|mtt?\d*)\2>(?:(?!</td>).)*</td>\s*)+)</tr>", RegexOptions.Compiled);
    Regex regTD = new Regex(@"(?is)(?<=<td[^>]*>)(?!\s*</td)(?:(?!</td\b).)*(?=</td>)", RegexOptions.Compiled);
    Regex regRe = new Regex(@"<[^>]*>|\s+", RegexOptions.Compiled);
    MatchCollection mcTR = regTR.Matches(yourStr);
    foreach (Match mTR in mcTR)
    {
        MatchCollection mcTD = regTD.Matches(mTR.Groups["content"].Value);
        foreach (Match mTD in mcTD)
        {
            richTextBox2.Text += regRe.Replace(mTD.Value, "") + "\n";
        }
        richTextBox2.Text += "\n------------------------------\n";
    }

另外需要说明的一点是,我写的代码里,把 <td...>和 </td>之间,去掉html标签后为空的内容过滤掉了,如果不想过滤

C# code

    Regex regTD = new Regex(@"(?is)(?<=<td[^>]*>)(?!\s*</td)(?:(?!</td\b).)*(?=</td>)", RegexOptions.Compiled);
    Regex regRe = new Regex(@"<[^>]*>|\s+", RegexOptions.Compiled);
    //替换为
    Regex regTD = new Regex(@"(?is)(?<=<td[^>]*>)(?:(?!</td\b).)*(?=</td>)", RegexOptions.Compiled);   //去掉html标签为空白字符串不过滤
    Regex regRe = new Regex(@"<[^>]*>", RegexOptions.Compiled);    //只去掉html标签,不过滤空白字符

可以在网上找找关于筛选html标签,只取文本的相关文章。

C# code

    /// <summary>
    /// 按字节长度截取字符串(支持截取带HTML代码样式的字符串)
    /// </summary>
    /// <param name="param">将要截取的字符串参数</param>
    /// <param name="length">截取的字节长度</param>
    /// <param name="end">字符串末尾补上的字符串</param>
    /// <returns></returns>
    public static string subStringHTML(string param, int length)
    {
        string Pattern = null;
        MatchCollection m = null;
        StringBuilder result = new StringBuilder();
        int n = 0;
        char temp;
        bool isCode = false; //是不是HTML代码
        bool isHTML = false; //是不是HTML特殊字符,如&nbsp;
        char[] pchar = param.ToCharArray();
        for (int i = 0; i < pchar.Length; i++)
        {
            temp = pchar[i];
            if (temp == '<')
            {
                isCode = true;
            }
            else if (temp == '&')
            {
                isHTML = true;
            }
            else if (temp == '>' && isCode)
            {
                n = n - 1;
                isCode = false;
            }
            else if (temp == ';' && isHTML)
            {
                isHTML = false;
            }
            if (!isCode && !isHTML)
            {
                n = n + 1;
                //UNICODE码字符占两个字节
                if (System.Text.Encoding.Default.GetBytes(temp + "").Length > 1)
                {
                    n = n + 1;
                }
            }
            result.Append(temp);
            if (n >= length)
            {
                break;
            }
        }
        result.Append("...");
        //取出截取字符串中的HTML标记
        string temp_result = result.ToString().Replace("(>)[^<>]*(<?)", "$1$2");
        //去掉不需要结束标记的HTML标记
        temp_result = temp_result.Replace(@"< (AREA|BASE|BASEFONT|BODY|BR|COL|COLGROUP|DD|DT|FRAME|HEAD|HR|HTML|IMG|INPUT|ISINDEX|LI|LINK|META|OPTION|P|PARAM|TBODY|TD|TFOOT|TH|THEAD|TR|area|base|basefont|body|br|col|colgroup|dd|dt|frame|head|hr|html|img|input|isindex|li|link|meta|option|p|param|tbody|td|tfoot|th|thead|tr)[^<>]* >", "");
        //去掉成对的HTML标记
        temp_result = temp_result.Replace(@"<([a-zA-Z]+)[^<>]*>(.*?)</\1>", "$2");
        //用正则表达式取出标记
        Pattern = ("<([a-zA-Z]+)[^<>]*>");
        m = Regex.Matches(temp_result, Pattern);
        ArrayList endHTML = new ArrayList();
        foreach (Match mt in m)
        {
            endHTML.Add(mt.Result("$1"));
        }
        //补全不成对的HTML标记
        for (int i = endHTML.Count - 1; i >= 0; i--)
        {
            result.Append("</");
            result.Append(endHTML[i]);
            result.Append(">");
        }
        return result.ToString();
    }

http://topic.csdn.net/u/20090722/21/c4812ab1-f659-49c2-99e4-617148121720.html
分享 转发