Python正则表达式re模块完全指南（2026年03月27日）

文档摘要

Python正则表达式re模块完全指南（2026年03月27日） Python的re模块提供了强大的正则表达式（Regular Expression）操作能力，是文本处理、数据清洗、模式匹配的必备工具。掌握正则表达式能够大幅提升文本处理的效率。基础概念正则表达式简介正则表达式是一种用于匹配字符串模式的强大工具，通过特定的语法规则描述字符串的特征： re模块核心方法元字符详解字符匹配位置匹配量词字符类分组与捕获基本分组反向引用高级特性零宽断言编译正则表达式正则表达式标志实战应用场景邮箱验证手机号验证身份证号验证 URL提取 HTML标签清理敏感词过滤日志解析数据提取文本替换 CSV解析（简单场景）性能优化预编译正则表达式避免贪婪匹配

Python正则表达式re模块完全指南（2026年03月27日）

Python的re模块提供了强大的正则表达式（Regular Expression）操作能力，是文本处理、数据清洗、模式匹配的必备工具。掌握正则表达式能够大幅提升文本处理的效率。

基础概念

正则表达式简介

正则表达式是一种用于匹配字符串模式的强大工具，通过特定的语法规则描述字符串的特征：


import re

# 简单匹配
pattern = r'hello'
text = 'hello world'
result = re.search(pattern, text)
print(result.group())  # 输出: hello

re模块核心方法


import re

# re.match() - 从字符串开头匹配
result = re.match(r'hello', 'hello world')  # 匹配成功
result = re.match(r'world', 'hello world')  # 返回None

# re.search() - 扫描整个字符串，返回第一个匹配
result = re.search(r'world', 'hello world')  # 匹配成功

# re.findall() - 返回所有非重叠匹配
results = re.findall(r'\d+', 'abc123def456')  # ['123', '456']

# re.finditer() - 返回迭代器
for match in re.finditer(r'\d+', 'abc123def456'):
    print(match.group())  # 123, 456

# re.sub() - 替换匹配项
result = re.sub(r'\d+', 'X', 'abc123def456')  # abcXdefX

# re.split() - 按模式分割字符串
parts = re.split(r'[,;]', 'a,b;c,d')  # ['a', 'b', 'c', 'd']

元字符详解

1. 字符匹配


# . - 匹配任意字符（除换行符）
re.search(r'.', 'a')      # 匹配 'a'
re.search(r'..', 'ab')    # 匹配 'ab'
re.search(r'...', 'abc')  # 匹配 'abc'

# \d - 匹配数字（str模式下默认匹配所有Unicode十进制数字，如全角数字；配合re.ASCII时才等同于 [0-9]）
re.findall(r'\d', 'a1b2c3')  # ['1', '2', '3']

# \D - 匹配非数字 [^0-9]
re.findall(r'\D', 'a1b2c3')  # ['a', 'b', 'c']

# \w - 匹配字母数字下划线（str模式下默认包含Unicode文字，如中文；配合re.ASCII时才等同于 [a-zA-Z0-9_]）
re.findall(r'\w', 'a_b 1')  # ['a', '_', 'b', '1']

# \W - 匹配非字母数字下划线
re.findall(r'\W', 'a_b 1')  # [' ']

# \s - 匹配空白字符 [ \t\n\r\f\v]
re.findall(r'\s', 'a b\tc')  # [' ', '\t']

# \S - 匹配非空白字符
re.findall(r'\S', 'a b\tc')  # ['a', 'b', 'c']

2. 位置匹配


# ^ - 字符串开头
re.search(r'^hello', 'hello world')  # 匹配
re.search(r'^world', 'hello world')  # 不匹配

# $ - 字符串结尾（也会匹配结尾换行符之前的位置；需要严格匹配整个字符串时请用 \Z 或 re.fullmatch）
re.search(r'world$', 'hello world')  # 匹配

# \b - 单词边界
re.search(r'\bhello\b', 'hello world')  # 匹配
re.search(r'\bhello\b', 'helloworld')   # 不匹配

# \B - 非单词边界
re.search(r'\Bhello\B', 'helloworld')   # 不匹配（'hello' 位于字符串开头，其前方是单词边界）
re.search(r'\Bello\B', 'helloworld')    # 匹配 'ello'（两侧都不是单词边界）

# ^ 和 $ 在多行模式中
re.findall(r'^\w+', 'hello\nworld', re.MULTILINE)  # ['hello', 'world']

3. 量词


# * - 0次或多次
re.findall(r'a*', 'baab')  # ['', 'aa', '', '']

# + - 1次或多次
re.findall(r'a+', 'baab')  # ['aa']

# ? - 0次或1次
re.findall(r'a?', 'baab')  # ['', 'a', 'a', '', '']

# {n} - 恰好n次
re.findall(r'a{2}', 'baaab')  # ['aa']

# {n,} - 至少n次
re.findall(r'a{2,}', 'baaaab')  # ['aaaa']

# {n,m} - n到m次
re.findall(r'a{2,3}', 'baaaab')  # ['aaa']

# 贪婪与非贪婪
re.findall(r'a.*', 'aabaa')      # ['aabaa'] (贪婪)
re.findall(r'a.*?', 'aabaa')     # ['a', 'a', 'a', 'a'] (非贪婪：.*? 尽量少匹配，这里每次匹配0个字符)

4. 字符类


# [...] - 匹配字符集中任意字符
re.findall(r'[abc]', 'a1b2c3')  # ['a', 'b', 'c']

# [^...] - 匹配不在字符集中的字符
re.findall(r'[^abc]', 'a1b2c3')  # ['1', '2', '3']

# 范围表示
re.findall(r'[a-z]', 'AaBbCc')  # ['a', 'b', 'c']
re.findall(r'[0-9]', 'a1b2c3')  # ['1', '2', '3']

# 预定义字符类
re.findall(r'[0-9]', 'a1b2c3')  # 等同于 \d
re.findall(r'[^0-9]', 'a1b2c3')  # 等同于 \D

分组与捕获

基本分组


# (...) - 分组
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2026-03-27')
print(match.group(0))    # '2026-03-27' (整个匹配)
print(match.group(1))    # '2026' (第1组)
print(match.group(2))    # '03' (第2组)
print(match.group(3))    # '27' (第3组)

# 命名分组
match = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', '2026-03-27')
print(match.group('year'))   # '2026'
print(match.group('month'))  # '03'
print(match.group('day'))    # '27'

# 非捕获分组 (?:...)
re.findall(r'(?:\d{4})-(\d{2})-(\d{2})', '2026-03-27')  # [('03', '27')]

反向引用


# \n - 引用第n个分组
re.search(r'(\w+)\s+\1', 'hello hello')  # 匹配
re.search(r'(\w+)\s+\1', 'hello world')  # 不匹配

# 命名分组引用
re.search(r'(?P<word>\w+)\s+(?P=word)', 'hello hello')  # 匹配

高级特性

1. 零宽断言


# 正向先行断言 (?=...) - 匹配...之前的位置
re.findall(r'\w+(?=\s*script)', 'javascript script coffee')  # ['javascript']（\w+ 贪婪匹配整个 'javascript'，其后紧跟 ' script'）

# 负向先行断言 (?!...) - 不匹配...之前的位置
re.findall(r'\w+(?!\s*script)', 'javascript coffee')  # ['javascript', 'coffee']

# 正向后行断言 (?<=...) - 匹配...之后的位置
re.findall(r'(?<=\$)\d+', 'Price: $100, $200')  # ['100', '200']

# 负向后行断言 (?<!...) - 不匹配...之后的位置
re.findall(r'(?<!\$)\d+', 'Price: 100, $200')  # ['100', '00']（'$200' 中的 '00' 前面是 '2' 而不是 '$'，同样会被匹配）

2. 编译正则表达式


# 预编译提升性能
pattern = re.compile(r'\d+')
results = pattern.findall('a1b2c3')  # ['1', '2', '3']

# 编译时指定标志
pattern = re.compile(r'hello', re.IGNORECASE)
pattern.search('HELLO')  # 匹配

3. 正则表达式标志


# re.IGNORECASE (或 re.I) - 忽略大小写
re.search(r'hello', 'HELLO', re.IGNORECASE)  # 匹配

# re.MULTILINE (或 re.M) - 多行模式
re.findall(r'^\w+', 'hello\nworld', re.MULTILINE)  # ['hello', 'world']

# re.DOTALL (或 re.S) - 点匹配换行符
re.search(r'.+', 'hello\nworld', re.DOTALL)  # 匹配 'hello\nworld'

# re.VERBOSE (或 re.X) - 详细模式（允许注释）
pattern = r'''
    \d{4}    # 年份
    -        # 分隔符
    \d{2}    # 月份
    -        # 分隔符
    \d{2}    # 日期
'''
re.search(pattern, '2026-03-27', re.VERBOSE)  # 匹配

# re.ASCII (或 re.A) - ASCII模式
re.findall(r'\w+', 'café', re.ASCII)  # ['caf']

# 组合标志
re.search(r'hello', 'HELLO\nWORLD', re.IGNORECASE | re.DOTALL)

实战应用场景

1. 邮箱验证


def validate_email(email):
    """Return True if *email* has a basic ``local@domain.tld`` shape.

    This is a pragmatic format check, not a full RFC 5322 validator.

    Args:
        email: The string to check.

    Returns:
        bool: True only when the entire string matches the pattern.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() must consume the whole string.  With match() and a trailing
    # ``$`` an input ending in a newline was accepted, because ``$`` also
    # matches just before a final newline.
    return re.fullmatch(pattern, email) is not None

print(validate_email('user@example.com'))      # True
print(validate_email('user.name+tag@example.co.uk'))  # True
print(validate_email('invalid.email'))          # False

2. 手机号验证


def validate_phone(phone):
    """Return True if *phone* looks like a mainland-China mobile number.

    The accepted shape is 11 ASCII digits: a leading ``1``, a second digit
    in ``3``-``9``, then nine more digits.

    Args:
        phone: The string to check.

    Returns:
        bool: True only when the entire string matches.
    """
    # [0-9] instead of \d: for str patterns \d also matches non-ASCII
    # decimal digits (e.g. full-width digits), which are not valid here.
    pattern = r'1[3-9][0-9]{9}'
    # fullmatch() rejects a trailing newline that ``$`` would let through.
    return re.fullmatch(pattern, phone) is not None

print(validate_phone('13812345678'))  # True
print(validate_phone('12345678901'))  # False

3. 身份证号验证


def validate_id_card(id_card):
    """Return True if *id_card* has the shape of an 18-character ID number.

    Layout checked: 6-digit region code (no leading zero), 4-digit year
    starting with 18/19/20, 2-digit month (01-12), 2-digit day (01-31),
    3-digit sequence, and a final digit or ``X``/``x``.

    Only the format is checked; the check character is not verified against
    the preceding digits and the day is not validated against the month.

    Args:
        id_card: The string to check.

    Returns:
        bool: True only when the entire string matches.
    """
    # [0-9] instead of \d so that non-ASCII digits are rejected.
    pattern = (
        r'[1-9][0-9]{5}'
        r'(?:18|19|20)[0-9]{2}'
        r'(?:0[1-9]|1[0-2])'
        r'(?:0[1-9]|[12][0-9]|3[01])'
        r'[0-9]{3}[0-9Xx]'
    )
    # fullmatch() rejects a trailing newline that ``$`` would let through.
    return re.fullmatch(pattern, id_card) is not None

print(validate_id_card('11010519900307299X'))  # True

4. URL提取


def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched URLs; empty when none are found.

    Note:
        This is a heuristic pattern. Punctuation such as a sentence-ending
        ``.`` directly after a URL is still captured, and ``\\w`` is
        Unicode-aware, so word characters that touch the URL without
        whitespace are absorbed as well.
    """
    # The path class must not contain a space: with a space in it the match
    # ran on past the first URL into the following words and swallowed the
    # scheme of the next URL.
    pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w.-]*/?'
    return re.findall(pattern, text)

text = '''
访问 https://www.example.com 和 http://blog.example.com/post/123
'''
print(extract_urls(text))
# ['https://www.example.com', 'http://blog.example.com/post/123']

5. HTML标签清理


def strip_html_tags(html):
    """Remove every ``<...>`` span from *html* and return the remaining text.

    A tag is any ``<`` followed by one or more non-``>`` characters and a
    closing ``>``; an empty ``<>`` is left untouched.
    """
    tag_regex = re.compile(r'<[^>]+>')
    return tag_regex.sub('', html)

html = '<p>Hello <strong>world</strong>!</p>'
print(strip_html_tags(html))  # 'Hello world!'

6. 敏感词过滤


def filter_sensitive_words(text, words):
    """Replace each occurrence of any word in *words* with ``'***'``.

    Args:
        text: The text to filter.
        words: Iterable of literal words to mask. Empty strings are ignored.

    Returns:
        str: The filtered text; *text* unchanged when there is nothing to
        filter.
    """
    # Drop empty entries and sort longest-first so that a short word cannot
    # pre-empt a longer word that starts with it.
    candidates = sorted((w for w in words if w), key=len, reverse=True)
    if not candidates:
        # An empty alternation matches the empty string at every position,
        # which would insert '***' between all characters.
        return text
    pattern = '|'.join(map(re.escape, candidates))
    return re.sub(pattern, '***', text)

text = "这是测试，包含敏感词汇"
sensitive_words = ['敏感词', '测试']
print(filter_sensitive_words(text, sensitive_words))
# '这是***，包含***汇'

7. 日志解析


def parse_apache_log(line):
    """Parse one Apache access-log line into a dict of string fields.

    Returns a dict with the keys ``ip``, ``timestamp``, ``method``, ``path``,
    ``protocol``, ``status`` and ``size``, or None when the line does not
    match. ``path`` and ``protocol`` are None when the quoted request
    contains fewer than three tokens.
    """
    pattern = r'(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s?(\S+)?\s?(\S+)?" (\d{3}) (\d+|-)'
    found = re.search(pattern, line)
    if found is None:
        return None
    # The pattern has exactly seven capture groups, in this order.
    field_names = ('ip', 'timestamp', 'method', 'path', 'protocol', 'status', 'size')
    return dict(zip(field_names, found.groups()))

log = '127.0.0.1 - - [27/Mar/2026:10:30:00 +0800] "GET /index.html HTTP/1.1" 200 2326'
print(parse_apache_log(log))

8. 数据提取


def extract_numbers(text):
    """Return all numbers in *text* as strings, in order of appearance.

    Integers, decimals and a leading minus sign are recognised.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched number strings; empty when none are found.
    """
    # The fractional part is all-or-nothing: a dot only belongs to the number
    # when digits follow it.  The looser ``\.?\d*`` form captured a
    # sentence-ending period as well (e.g. '3.').
    pattern = r'-?\d+(?:\.\d+)?'
    return re.findall(pattern, text)

text = '温度: 25.5°C, 湿度: 60%, 价格: -12.3元'
print(extract_numbers(text))  # ['25.5', '60', '-12.3']

9. 文本替换


def format_date(text):
    """Rewrite dates written as ``YYYY年MM月DD日`` into ``YYYY-MM-DD``.

    Only dates with a four-digit year and two-digit month and day are
    rewritten; all other text is returned unchanged.
    """
    chinese_date = re.compile(r'(\d{4})年(\d{2})月(\d{2})日')
    return chinese_date.sub(r'\1-\2-\3', text)

text = '今天是2026年03月27日'
print(format_date(text))  # '今天是2026-03-27'

10. CSV解析（简单场景）


def parse_simple_csv(line):
    """Split one simple CSV line into its fields.

    Quoted fields are not supported: every comma is treated as a delimiter.

    Args:
        line: A single CSV line.

    Returns:
        list[str]: The fields in order, including empty ones; an empty list
        for an empty line.
    """
    if not line:
        return []
    # Split on the delimiter rather than collecting runs of non-comma
    # characters: the latter silently drops empty fields ('a,,b' -> 2 items)
    # and shifts every later column.
    return re.split(r',', line)

line = '张三,25,北京,工程师'
print(parse_simple_csv(line))  # ['张三', '25', '北京', '工程师']

性能优化

1. 预编译正则表达式


# 不推荐：每次调用都要按模式字符串查询re模块的内部缓存（缓存未命中时才会重新编译）
def extract_emails(text):
    """Return all email-like substrings of *text*.

    The pattern string is handed to the module-level ``re.findall`` on
    every call.
    """
    email_regex = r'[\w.+-]+@[\w-]+\.[\w.-]+'
    found = re.findall(email_regex, text)
    return found

# 推荐：预编译提升性能
# Compiled once at import time and reused by every call below.
EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')


def extract_emails(text):
    """Return all email-like substrings of *text* using ``EMAIL_PATTERN``."""
    # The pattern has no capture groups, so each match's group() is exactly
    # the string that findall() would yield.
    return [hit.group() for hit in EMAIL_PATTERN.finditer(text)]

2. 避免贪婪匹配


# 不推荐：贪婪匹配可能导致回溯
pattern = r'<.+>'
re.search(pattern, '<a>hello</a>')  # 匹配 '<a>hello</a>'

# 推荐：使用非贪婪或更精确的模式
pattern = r'<.+?>'
re.search(pattern, '<a>hello</a>')  # 匹配 '<a>'

3. 使用原子分组（Python 3.11+）


# (?>...) - 原子分组，防止回溯
pattern = r'(?>\d+)'

调试技巧

1. 使用re.DEBUG查看匹配过程


import re
pattern = re.compile(r'\d+', re.DEBUG)
# 输出详细的编译信息

2. 使用在线工具

推荐工具：

regex101.com（支持Python语法）
regexr.com（交互式学习）
pythex.org（Python专用）

常见陷阱

1. 忘记转义


# 错误：点号匹配任意字符
re.search(r'example.com', 'exampleXcom')  # 匹配（错误）

# 正确：转义点号
re.search(r'example\.com', 'example.com')  # 匹配（正确）

2. 贪婪匹配问题


# 错误：贪婪匹配过度
re.findall(r'<.+>', '<a>hello</a><b>world</b>')  # ['<a>hello</a><b>world</b>']

# 正确：非贪婪匹配
re.findall(r'<.+?>', '<a>hello</a><b>world</b>')  # ['<a>', '</a>', '<b>', '</b>']

3. 换行符处理


# 默认：点号不匹配换行符
re.search(r'hello.+world', 'hello\nworld')  # 不匹配

# 使用re.DOTALL标志
re.search(r'hello.+world', 'hello\nworld', re.DOTALL)  # 匹配

总结

Python的re模块是处理文本的强大工具，掌握正则表达式能够：

提升效率：一行代码替代复杂循环
精确匹配：灵活的模式匹配能力
通用技能：正则表达式在多种语言中通用

关键要点：

理解元字符的含义和用法
掌握分组和捕获的机制
熟练使用预编译提升性能
注意贪婪匹配的陷阱
合理使用非贪婪模式
复杂模式使用VERBOSE标志增加可读性

在2026年的今天，虽然出现了许多高级的文本处理库（如spaCy、transformers），正则表达式依然是快速、轻量级文本处理的首选方案。对于简单的模式匹配、数据清洗、格式验证等场景，正则表达式依然是最优选择。

记住：正则表达式不是万能的，对于复杂的文本解析任务（如HTML解析），应该使用专门的解析库（如BeautifulSoup、lxml）；但对于灵活的文本匹配和替换任务，正则表达式依然是不可或缺的工具。