Appearance
数据结构与文件处理
Python 处理运维数据时最常用的是列表、字典、元组、集合、文件、JSON 和正则。主机清单、接口返回、命令输出、配置文件和日志行都可以落到这些结构里。
数据结构选对,脚本后续处理能少很多绕路。主机列表适合用列表,主机属性适合用字典,唯一值去重适合用集合,固定结构的返回值适合用元组。
一、列表
列表是有顺序的集合,适合保存一组主机、服务、文件路径。
python
hosts = ["web01", "web02", "db01"]
hosts.append("redis01") # 追加一个元素
hosts.remove("db01") # 删除指定元素
first_host = hosts[0] # 取第一个元素
last_host = hosts[-1] # 取最后一个元素
print(hosts)
print(first_host, last_host)
切片:
python
hosts = ["web01", "web02", "web03", "db01"]
print(hosts[0:2]) # ['web01', 'web02']
print(hosts[2:]) # ['web03', 'db01']
列表推导式适合简单转换:
python
hosts = ["web01", "web02", "db01"]
web_hosts = [host for host in hosts if host.startswith("web")]
print(web_hosts) # ['web01', 'web02']
推导式太长时改成普通 for 循环。条件多、还要打印日志时,普通循环更容易排查。
二、字典
字典是 key-value 结构,适合保存配置和对象属性。
python
service = {
    "name": "nginx",
    "port": 80,
    "enabled": True,
}
print(service["name"])
print(service.get("owner", "unknown")) # key 不存在时返回默认值
更新字段:
python
service["port"] = 8080
service["status"] = "running"
遍历:
python
for key, value in service.items():
    print(f"{key}={value}")
嵌套字典很常见,比如主机清单:
python
inventory = {
    "web01": {"ip": "192.168.10.11", "role": "web"},
    "db01": {"ip": "192.168.10.21", "role": "mysql"},
}
for hostname, info in inventory.items():
    print(f"{hostname} {info['ip']} {info['role']}")
访问嵌套字段时要确认 key 是否存在。外部 API 返回的数据结构不稳定时,直接 data["a"]["b"] 容易报 KeyError。
三、元组和集合
元组是不可变序列,适合表示固定结构的返回值:
python
def parse_address(address):
    """Split a ``host:port`` string into a ``(host, port)`` tuple.

    The split happens at the last colon, so a host part that itself
    contains colons (for example a bracketed IPv6 literal such as
    ``[::1]:22``) is kept intact.

    Args:
        address: Text in ``host:port`` form.

    Returns:
        A ``(host, port)`` tuple; ``port`` is converted to ``int``.

    Raises:
        ValueError: If there is no colon separator or the port part is
            not an integer.
    """
    host, separator, port_text = address.rpartition(":")
    if not separator:
        raise ValueError(f"address must look like host:port, got {address!r}")
    return host, int(port_text)
host, port = parse_address("127.0.0.1:22")
print(host, port)
集合用于去重和成员判断:
python
raw_hosts = ["web01", "web01", "web02", "db01"]
unique_hosts = set(raw_hosts)
print(unique_hosts)
print("web01" in unique_hosts)
集合没有稳定顺序。需要按顺序输出时再排序:
python
for host in sorted(unique_hosts):
    print(host)
四、文件读取
pathlib.Path 比直接拼字符串更适合处理路径。
python
from pathlib import Path
path = Path("/etc/hosts")
content = path.read_text(encoding="utf-8")
print(content)
按行读取大文件更稳:
python
from pathlib import Path
path = Path("/var/log/messages")
with path.open("r", encoding="utf-8", errors="ignore") as file:
    for line in file:
        if "ERROR" in line:
            print(line.strip())
with 会在代码块结束后自动关闭文件句柄。日志文件很大时,按行读取不会一次把整个文件加载进内存。
判断文件和目录:
python
from pathlib import Path
path = Path("/data/backup")
if path.exists():
    print(f"exists: {path}")
if path.is_dir():
    print(f"is directory: {path}")
五、文件写入
写文本:
python
from pathlib import Path
report = Path("/tmp/check-report.txt")
lines = [
    "host=web01 status=ok",
    "host=db01 status=warning",
]
report.write_text("\n".join(lines) + "\n", encoding="utf-8")
追加写入:
python
from pathlib import Path
log_file = Path("/tmp/ops-script.log")
with log_file.open("a", encoding="utf-8") as file:
    file.write("script started\n")
创建目录:
python
from pathlib import Path
output_dir = Path("/tmp/ops-output")
output_dir.mkdir(parents=True, exist_ok=True) # 父目录不存在时一起创建
写配置、报告、临时文件时,先写到临时文件再替换正式文件更稳:
python
from pathlib import Path
target = Path("/tmp/app.conf")
tmp = target.with_suffix(".conf.tmp")
tmp.write_text("port=8080\n", encoding="utf-8")
tmp.replace(target) # 同文件系统内替换,避免写到一半留下半截文件
六、JSON
接口返回和配置文件经常用 JSON。Python 里用 json 标准库处理。
python
import json
raw = '{"name": "nginx", "port": 80}'
data = json.loads(raw)
print(data["name"])
对象转 JSON 字符串:
python
import json
service = {
    "name": "nginx",
    "port": 80,
    "enabled": True,
}
text = json.dumps(service, ensure_ascii=False, indent=2)
print(text)
读写 JSON 文件:
python
import json
from pathlib import Path
path = Path("/tmp/service.json")
data = {
    "name": "nginx",
    "port": 80,
}
path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
loaded = json.loads(path.read_text(encoding="utf-8"))
print(loaded["port"])
ensure_ascii=False 会保留中文,不把中文转成 \uXXXX。排查配置和报告时,可读性更好。
七、正则表达式
正则适合从非结构化文本里抽字段,比如日志行、命令输出。能用 JSON、CSV、YAML 这类结构化格式时,优先用对应解析器;正则更适合处理没有固定结构的数据。
python
import re
line = "2026-05-29 10:11:12 ERROR disk full path=/data usage=95%"
pattern = r"(?P<time>\S+ \S+) (?P<level>\S+) (?P<message>.*)"
match = re.match(pattern, line)
if match:
    print(match.group("time"))
    print(match.group("level"))
    print(match.group("message"))
提取 IP:
python
import re
text = "failed login from 192.168.10.11, next 10.0.0.5"
ips = re.findall(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", text)
print(ips)
替换敏感信息:
python
import re
line = "password=123456 token=abcdef"
masked = re.sub(r"(password|token)=\S+", r"\1=***", line)
print(masked)
正则能跑通不代表足够准确。IP、时间、URL、SQL 这类格式如果要求严格,表达式要按实际日志格式收紧。
八、CSV 简单处理
主机清单有时会从表格导出成 CSV。
python
import csv
from pathlib import Path
path = Path("/tmp/hosts.csv")
with path.open("r", encoding="utf-8", newline="") as file:
    reader = csv.DictReader(file)
    for row in reader:
        # DictReader 会按表头生成字典,字段名直接来自 CSV 第一行
        print(row["hostname"], row["ip"], row["role"])
写 CSV:
python
import csv
from pathlib import Path
path = Path("/tmp/report.csv")
rows = [
    {"hostname": "web01", "status": "ok"},
    {"hostname": "db01", "status": "warning"},
]
with path.open("w", encoding="utf-8", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["hostname", "status"])
    writer.writeheader()
    writer.writerows(rows)
CSV 不适合保存复杂嵌套结构。主机属性很多、还有列表或嵌套对象时,JSON 或 YAML 更自然。
九、一个小报告脚本
这个脚本读取主机 JSON,筛出 Web 主机,生成文本报告。
python
#!/usr/bin/env python3
"""从主机清单生成 Web 主机报告。"""
import json
from pathlib import Path
def load_inventory(path):
    """Read a JSON inventory file and return the decoded data.

    Args:
        path: Location of the inventory file, as a ``str`` or a
            ``pathlib.Path``.

    Returns:
        Whatever the JSON document decodes to.

    Raises:
        OSError: If the file cannot be read (for example it does not exist).
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # No try/except here on purpose: read and parse errors propagate so the
    # caller can decide how to report them.
    return json.loads(Path(path).read_text(encoding="utf-8"))
def select_hosts(inventory, role):
    """Return the hosts in ``inventory`` whose "role" value equals ``role``.

    Args:
        inventory: Iterable of host mappings; each entry must provide a
            ``.get`` method (for example a ``dict``).
        role: Role value to match against each host's "role" key.

    Returns:
        A new list with the matching hosts, in their original order. An
        entry without a "role" key is compared as ``None``.
    """
    return [host for host in inventory if host.get("role") == role]
def render_report(hosts, title="Web hosts:"):
    """Render a plain-text report with one bullet line per host.

    Args:
        hosts: Iterable of mappings that each contain the keys
            "hostname" and "ip".
        title: First line of the report. Defaults to "Web hosts:".

    Returns:
        The report text: the title followed by one "- <hostname> <ip>"
        line per host, terminated by a single trailing newline.

    Raises:
        KeyError: If a host lacks the "hostname" or "ip" key.
    """
    lines = [title]
    lines.extend(f"- {host['hostname']} {host['ip']}" for host in hosts)
    return "\n".join(lines) + "\n"
def main():
    """Generate the web-host report from the JSON inventory.

    Reads ``/tmp/inventory.json``, keeps the hosts whose role is "web" and
    writes the rendered report to ``/tmp/web-hosts.txt``.

    Returns:
        0 when the report was written; 1 when a file could not be read or
        written, the inventory could not be parsed, or a host entry lacks
        a field the report needs. The error is printed to stderr.
    """
    import sys  # local import: only the error path needs it

    inventory_path = Path("/tmp/inventory.json")
    report_path = Path("/tmp/web-hosts.txt")
    try:
        inventory = load_inventory(inventory_path)
        web_hosts = select_hosts(inventory, "web")
        report_path.write_text(render_report(web_hosts), encoding="utf-8")
    except OSError as error:
        # Missing inventory, permission problems, unwritable report path.
        print(f"file error: {error}", file=sys.stderr)
        return 1
    except ValueError as error:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"cannot parse {inventory_path}: {error}", file=sys.stderr)
        return 1
    except KeyError as error:
        print(f"inventory entry is missing field {error}", file=sys.stderr)
        return 1
    print(f"report written: {report_path}")
    return 0
if __name__ == "__main__":
    raise SystemExit(main())
输入示例:
json
[
{"hostname": "web01", "ip": "192.168.10.11", "role": "web"},
{"hostname": "db01", "ip": "192.168.10.21", "role": "mysql"}
]
这个脚本虽然很小,但已经有了运维脚本常见结构:读取输入、筛选数据、生成输出、返回状态。