记一次小米笔记导入为知笔记

发布于 / 小技巧 / 0 条评论

近期要统一个人数据到NAS端管理,顺手把小米笔记的数据迁移到了为知笔记中。本文记录了整个过程以及踩过的坑。

数据下载

MIUI笔记APP是没有导入导出功能的,而且没有笔记历史版本的功能,所以我直接选择从小米云中下载已经同步了的便签数据。

首先打开小米云 https://i.mi.com/ ,登录,找到云笔记

按下F12,找到加载全部笔记、加载笔记详情、加载历史版本和历史版本详情的API,总结了如下文档:

获取全部笔记

  • API:https://i.mi.com/note/full/page

输出样例:

{
  "result": "ok",
  "retriable": false,
  "code": 0,
  "data": {
    "entries": [
      {
        "snippet": "xxx",
        "modifyDate": 笔记修改时间,
        "colorId": 0,
        "subject": "",
        "alertDate": 0,
        "type": "note",
        "folderId": "xxx",
        "setting": {
          "themeId": 0,
          "stickyTime": 0,
          "version": 0
        },
        "deleteTime": 0,
        "alertTag": 0,
        "id": "xxxxx", # 这里是笔记ID
        "tag": "xxx",
        "createDate": 笔记创建时间,
        "status": "normal",
        "extraInfo": "{\"title\":\"\",\"note_content_type\":\"common\",\"mind_content\":\"\",\"mind_content_plain_text\":\"\"}"
      },
      ...
    ]
  }
}

获取单个笔记详情

  • API:https://i.mi.com/note/note/笔记ID/

输出样例:

{
    "result": "ok",
    "retriable": false,
    "code": 0,
    "data": {
        "entry": {
            "snippet": "xxx",
            "modifyDate": 笔记修改时间,
            "colorId": 0,
            "subject": "",
            "alertDate": 0,
            "type": "note",
            "folderId": "xxx",
            "content": "这里是笔记内容",
            "setting": {
                "themeId": 0,
                "stickyTime": 置顶时间,
                "version": 0
            },
            "deleteTime": 0,
            "alertTag": 0,
            "id": "xxx",
            "tag": "xxx",
            "createDate": 笔记创建时间,
            "status": "normal",
            "extraInfo": "{\"note_content_type\":\"common\",\"mind_content_plain_text\":\"\",\"title\":\"\",\"mind_content\":\"\"}"
        }
    },
    "description": "成功"
}

获取单个笔记的历史版本

  • API:https://i.mi.com/note/full/history/times?id=笔记ID

输出样例:

{
    "result": "ok",
    "retriable": false,
    "code": 0,
    "data": {
        "tvList": [
            {
                "updateTime": 更新时间,
                "version": 这里是版本ID
            },

            ...
        ]
    },
    "description": "成功"
}

获取某一版本内容

  • API:https://i.mi.com/note/full/history?id=笔记ID&version=版本ID

输出样例:

{
    "result": "ok",
    "retriable": false,
    "code": 0,
    "data": {
        "entry": {
            "modifyDate": 修改时间,
            "colorId": 0,
            "alertDate": 0,
            "type": "note",
            "folderId": 0,
            "content": "这里是该版本笔记内容",
            "setting": {
                "themeId": 0,
                "stickyTime": 0,
                "version": 0
            },
            "deleteTime": 0,
            "alertTag": 0,
            "id": "笔记ID",
            "tag": "笔记标签",
            "createDate": 笔记创建时间,
            "status": "normal",
            "extraInfo": "{\"title\":\"\",\"note_content_type\":\"common\",\"mind_content\":\"\",\"mind_content_plain_text\":\"\"}"
        }
    },
    "description": "成功"
}

根据这个API文档,不难写出一个爬虫,把所有笔记都爬下来

import os
import json
import requests
import time

# Global cookies, passed as `cookies=` to every request made by this script.
# You have to provide these yourself: the "xxx" values are placeholders and
# must be replaced with the values of your own logged-in session.
cookies = {
    "uLocale":"zh_CN",
    "iplocale":"zh_CN",
    "userId":"xxx",
    "i.mi.com_isvalid_servicetoken":"true",
    "i.mi.com_ph":"xxx",
    "i.mi.com_istrudev":"true",
    "serviceToken":"xxx",
    "i.mi.com_slh":"xxx"
}

# Request headers imitating a Chrome browser; passed as `headers=` to every
# request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json',
    'Content-Type': 'application/json',
}

def get_current_timestamp():
    """Return the current Unix time in whole milliseconds (truncated to int)."""
    seconds = time.time()
    return int(seconds * 1000)

def get_all_notes(limit=200, timeout=30):
    """Fetch one page of the note list from Mi Cloud.

    Args:
        limit: Maximum number of entries requested for the page. Defaults to
            200, the value that used to be hard-coded in the URL.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # NOTE(review): only a single page is requested; if the account holds more
    # than `limit` notes the rest is presumably not returned by this call --
    # confirm against the API and add paging if needed.
    response = requests.get(
        'https://i.mi.com/note/full/page',
        # `ts` is the current time in milliseconds, as in the original URL.
        params={'ts': get_current_timestamp(), 'limit': limit},
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    return response.json()

def get_note_content(note_id, timeout=30):
    """Fetch the full detail (including content) of a single note.

    Args:
        note_id: ID of the note, as found in the ``id`` field of a list entry.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(
        f'https://i.mi.com/note/note/{note_id}/',
        # `ts` is the current time in milliseconds, as in the original URL.
        params={'ts': get_current_timestamp()},
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    return response.json()

def get_note_history(note_id, timeout=30):
    """Fetch the list of historical versions of a single note.

    Args:
        note_id: ID of the note whose history is requested.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(
        'https://i.mi.com/note/full/history/times',
        # Same parameters, in the same order, as the original query string.
        params={'ts': get_current_timestamp(), 'id': note_id},
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    return response.json()

def get_note_version(note_id, version_id, timeout=30):
    """Fetch the content of one specific historical version of a note.

    Args:
        note_id: ID of the note.
        version_id: Version identifier, as found in the ``version`` field of a
            history list item.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(
        'https://i.mi.com/note/full/history',
        # Passing the values through `params` also URL-encodes them.
        params={
            'ts': get_current_timestamp(),
            'id': note_id,
            'version': version_id,
        },
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    return response.json()

def save_note(note_data, note_folder):
    """Write a note's content and its raw API response into ``note_folder``.

    ``index.html`` receives ``note_data['data']['entry']['content']`` and
    ``info`` receives the whole response as indented JSON with non-ASCII
    characters kept as-is. The folder is created when it does not exist.
    """
    os.makedirs(note_folder, exist_ok=True)

    # Latest note body.
    content_path = os.path.join(note_folder, 'index.html')
    with open(content_path, 'w', encoding='utf-8') as content_file:
        content_file.write(note_data['data']['entry']['content'])

    # Complete JSON response for the note.
    info_path = os.path.join(note_folder, 'info')
    with open(info_path, 'w', encoding='utf-8') as info_file:
        json.dump(note_data, info_file, ensure_ascii=False, indent=4)

def save_note_versions(note_id, versions, note_folder):
    """Download every listed historical version of a note into ``note_folder``.

    For each item of ``versions`` (the keys ``version`` and ``updateTime`` are
    read) the version is fetched with ``get_note_version`` and stored as
    ``index.html_<updateTime>`` (its content) and ``info_<updateTime>`` (the
    full JSON response). ``note_folder`` is not created here, so it must
    already exist.
    """
    for item in versions:
        payload = get_note_version(note_id, item['version'])
        stamp = item['updateTime']

        # Content of this version.
        content_path = os.path.join(note_folder, f'index.html_{stamp}')
        with open(content_path, 'w', encoding='utf-8') as content_file:
            content_file.write(payload['data']['entry']['content'])

        # Complete JSON response of this version.
        info_path = os.path.join(note_folder, f'info_{stamp}')
        with open(info_path, 'w', encoding='utf-8') as info_file:
            json.dump(payload, info_file, ensure_ascii=False, indent=4)

def main():
    """Crawl every listed note, with its history, into ``./notes/<note id>``."""
    listing = get_all_notes()
    print(listing)
    if listing['result'] != 'ok':
        return

    for entry in listing['data']['entries']:
        note_id = entry['id']
        print(f'正在处理笔记ID: {note_id}')

        # Latest content of the note.
        target_dir = f'./notes/{note_id}'
        save_note(get_note_content(note_id), target_dir)

        # Historical versions, when the API reports any.
        history = get_note_history(note_id)
        has_versions = history['result'] == 'ok' and 'tvList' in history['data']
        if not has_versions:
            print(f'笔记ID {note_id} 没有历史版本。')
            continue
        save_note_versions(note_id, history['data']['tvList'], target_dir)


if __name__ == '__main__':
    main()

此时我们得到了一个notes文件夹,里面每个子文件夹(以笔记ID命名)保存了一篇笔记的全部版本:index.html为最新版本,index.html_xxx为历史版本(xxx为该版本的更新时间戳),info为最新版本的原始json数据,info_xxx为对应历史版本的原始json数据。

为知笔记容器分析

数据有了,要想导入到为知笔记中,还得分析一下它的工作原理。

首先进入为知笔记的容器中

docker exec -it iznote /bin/bash

发现里面运行着一个MySQL服务,还有一个nodejs应用。在为知笔记中,每个用户都有一个UUID,在/wiz/storage/data_root/document2目录下可以看到所有用户的UUID(注意其中也包含系统用户的UUID),笔记数据和历史版本数据就保存在这个目录下。

在里面看看为知的源码,可以在/wiz/app/entrypoint.sh中找到容器内部MySQL的密码。

为了在外部访问这个MySQL服务器,我们先添加个root@%的权限。

mysql -uroot -pxxxxxx
use mysql;
-- The account must be written as two separately quoted parts, 'user'@'host'.
-- Quoted as one string ('root@%') the whole text is taken as the user name
-- and the host part defaults to '%', so an account literally named "root@%"
-- would be granted instead of root.
grant all privileges on *.* to 'root'@'%' identified by '密码' with grant option;
flush privileges;

接着重新创建容器,把3306映射出来,即可外部访问了。

通过对MySQL里面的数据表的分析,发现所有笔记主要保存到wizksent.wiz_document表内:

CREATE TABLE `wiz_document` (
  `ID` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `DOCUMENT_GUID` binary(16) NOT NULL COMMENT '笔记GUID',
  `KB_GUID` binary(16) NOT NULL COMMENT '知识库GUID',
  `VERSION` bigint(20) NOT NULL DEFAULT '-1' COMMENT '版本号',
  `DOCUMENT_DATA_MD5` binary(16) DEFAULT NULL COMMENT '笔记数据md5',
  `DT_DATA_MODIFIED` datetime DEFAULT NULL COMMENT '笔记数据修改时间',
  `DOCUMENT_INFO_MD5` binary(16) DEFAULT NULL COMMENT '笔记基本信息md5',
  `DT_INFO_MODIFIED` datetime DEFAULT NULL COMMENT '笔记基本信息修改时间',
  `DOCUMENT_DATA_SIZE` int(11) NOT NULL COMMENT '笔记数据大小',
  `DOCUMENT_TITLE` varchar(255) DEFAULT NULL COMMENT '笔记标题',
  `DOCUMENT_CATEGORY` varchar(260) DEFAULT NULL COMMENT '笔记目录',
  `DOCUMENT_OWNER` varchar(150) DEFAULT NULL COMMENT '笔记所有者,用作标识普通editor能否删除和修改',
  `DOCUMENT_OWNER_GUID` binary(16) DEFAULT NULL COMMENT '文档作者的USERGUID',
  `DOCUMENT_ICON_INDEX` int(11) DEFAULT NULL COMMENT '笔记ICON索引',
  `DOCUMENT_PROTECT` tinyint(4) DEFAULT NULL COMMENT '笔记是否加密',
  `DOCUMENT_READ_COUNT` int(11) DEFAULT NULL COMMENT '笔记阅读次数',
  `DOCUMENT_ATTACHMENT_COUNT` int(11) DEFAULT NULL COMMENT '笔记附件数',
  `DOCUMENT_TYPE` varchar(16) DEFAULT NULL COMMENT '笔记类型',
  `DOCUMENT_FILE_TYPE` varchar(16) DEFAULT NULL COMMENT '笔记的文件类型',
  `DT_CREATED` datetime NOT NULL COMMENT '创建日期',
  `DT_ACCESSED` datetime DEFAULT NULL COMMENT '笔记最后访问日期',
  `GPS_LATITUDE` float(10,6) DEFAULT NULL COMMENT '纬度',
  `GPS_LONGITUDE` float(10,6) DEFAULT NULL COMMENT '经度',
  `DOCUMENT_URL` varchar(2048) DEFAULT NULL COMMENT '笔记 URL',
  `STYLE_GUID` binary(16) DEFAULT NULL COMMENT '样式GUID',
  `DOCUMENT_PARAM_MD5` binary(16) DEFAULT NULL COMMENT '笔记参数md5',
  `DT_PARAM_MODIFIED` datetime DEFAULT NULL COMMENT '笔记参数修改时间',
  `DOCUMENT_SEO` varchar(100) DEFAULT NULL COMMENT '博客发布插件使用',
  `DOCUMENT_AUTHOR` varchar(32) DEFAULT NULL COMMENT '助手插件设置的 author',
  `DOCUMENT_KEYWORDS` varchar(300) DEFAULT NULL COMMENT '助手插件设置的 keywords',
  `DOCUMENT_ABSTRACT_TEXT` varchar(255) DEFAULT NULL,
  `IS_DOCUMENT_ABSTRACT_IMAGE` tinyint(4) NOT NULL DEFAULT '0',
  `BODY_TEXT` longtext,
  `DOCUMENT_COVER_IMAGE` tinyint(4) DEFAULT NULL,
  `MARKERS` varchar(64) DEFAULT NULL,
  PRIMARY KEY (`ID`),
  UNIQUE KEY `uniq_doc_kb` (`DOCUMENT_GUID`,`KB_GUID`),
  KEY `kb_version` (`KB_GUID`,`VERSION`),
  KEY `index_kbguid_category` (`KB_GUID`,`DOCUMENT_CATEGORY`(191)),
  KEY `idx_kb_datamodified` (`KB_GUID`,`DT_DATA_MODIFIED`),
  FULLTEXT KEY `ft_index` (`DOCUMENT_TITLE`,`BODY_TEXT`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4

其中DOCUMENT_GUID为笔记的GUID,KB_GUID为用户知识库的GUID。这两个字段在MySQL中以二进制(binary(16))保存,所以直接查看会显示为乱码。

这么看来思路就非常清晰了,我们要先把笔记转成对应的格式,放在以DOCUMENT_GUID命名的文件夹下,最后把所有文件夹放在/wiz/storage/data_root/document2/KB_GUID下面,应该就能实现导入了!

数据导入

通过上面的分析,写了个脚本直接导入:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json
import re
import random
import pymysql
import datetime

def generate_guid():
    """Return a 32-character lowercase hex GUID string that starts with ``ff03``.

    The remaining 28 characters are random hex digits. The prefix is fixed so
    that imported data cannot get mixed up with existing files: if an import
    goes wrong, ``rm -rf ff03*`` is enough to roll it back.
    """
    prefix = "ff03"
    hex_alphabet = '0123456789abcdef'
    tail = random.choices(hex_alphabet, k=28)
    return prefix + ''.join(tail)

def replace_newlines(content):
    """Return ``content`` with every newline character replaced by ``<br>``."""
    lines = content.split("\n")
    return "<br>".join(lines)

def remove_html(text):
    """Strip everything that looks like an HTML tag, plus all CR/LF characters."""
    # Anything between "<" and the next ">" is treated as a tag.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Drop line feeds (code point 10) and carriage returns (13) in one pass.
    return without_tags.translate({10: None, 13: None})

def format_guid(guid_str):
    """Insert dashes into a dash-less GUID string using the 8-4-4-4-12 layout.

    Raises:
        ValueError: If ``guid_str`` is not exactly 32 characters long.
    """
    if len(guid_str) != 32:
        raise ValueError("GUID字符串长度必须为32个字符")

    # Consecutive pairs of these offsets delimit the five dash-separated groups.
    boundaries = (0, 8, 12, 16, 20, 32)
    return '-'.join(
        guid_str[start:stop]
        for start, stop in zip(boundaries, boundaries[1:])
    )


def process_note(note_folder, output_base, db_conn):
    """Convert one crawled note folder and register the note in MySQL.

    Reads ``info`` and ``index.html`` (plus every ``index.html_<timestamp>``
    history file) from ``note_folder``, writes the converted files into a new
    GUID-named folder below ``output_base``, inserts one row into
    ``wiz_document`` and commits it, and finally writes a ``success`` marker
    file into the output folder.

    Args:
        note_folder: Folder holding one crawled note.
        output_base: Root folder that receives the converted note folders.
        db_conn: Open database connection; ``cursor()`` and ``commit()`` are
            called on it.

    Raises:
        Nothing is caught here: errors from file access, JSON parsing,
        timestamp conversion or the database propagate to the caller.
    """
    # Read the info file (the raw JSON response stored with the note).
    info_file = os.path.join(note_folder, "info")
    with open(info_file, "r", encoding="utf-8") as f:
        info_data = json.load(f)
    entry = info_data["data"]["entry"]

    # Extract the note fields used below.
    snippet   = entry.get("snippet", "")
    modify_ms = entry.get("modifyDate")
    subject   = entry.get("subject", "").strip()
    content   = entry.get("content", "")
    create_ms = entry.get("createDate")

    # When the note has no title, use its creation time ("YYYY-MM-DD HH:MM:SS").
    if not subject:
        subject = datetime.datetime.fromtimestamp(create_ms/1000).strftime("%Y-%m-%d %H:%M:%S")

    # Convert the millisecond timestamps to date strings. fromtimestamp() is
    # called without a tz argument, so the result is in local time.
    dt_modify = datetime.datetime.fromtimestamp(modify_ms/1000).strftime("%Y-%m-%d %H:%M:%S")
    dt_create = datetime.datetime.fromtimestamp(create_ms/1000).strftime("%Y-%m-%d %H:%M:%S")

    # Generate the note GUID.
    note_guid = generate_guid()
    # Create the output folder, named after the dashed form of the GUID.
    output_folder = os.path.join(output_base, format_guid(note_guid))
    os.makedirs(output_folder, exist_ok=True)

    # ------------------
    # Latest version
    # ------------------
    latest_index_path = os.path.join(note_folder, "index.html")
    with open(latest_index_path, "r", encoding="utf-8") as f:
        latest_content = f.read()
    # Replace newlines with <br>.
    latest_content_processed = replace_newlines(latest_content)
    # Save into the output folder.
    with open(os.path.join(output_folder, "index.html"), "w", encoding="utf-8") as f:
        f.write(latest_content_processed)

    # Write the note.info file that goes with the latest version.
    note_info_data = {
        "versionInfo": {
            "version": modify_ms,  # timestamp of the latest version
            "editorGuid": "00000000-0000-0000-0000-000000000000",
            "clientType": "web",
            "clientVersion": "4.0"
        },
        "resources": []
    }
    with open(os.path.join(output_folder, "note.info"), "w", encoding="utf-8") as f:
        json.dump(note_info_data, f, ensure_ascii=False)

    # --------------------
    # Historical versions
    # --------------------
    for file_name in os.listdir(note_folder):
        if file_name.startswith("index.html_"):
            # The timestamp is the part after the prefix (file name format: index.html_xxx).
            timestamp = file_name[len("index.html_"):]
            version_path = os.path.join(note_folder, file_name)
            with open(version_path, "r", encoding="utf-8") as f:
                version_content = f.read()
            version_content_processed = replace_newlines(version_content)
            # New file name: index.html_00{timestamp}_00000000-0000-0000-0000-000000000000_web_4.0
            new_index_name = f"index.html_00{timestamp}_00000000-0000-0000-0000-000000000000_web_4.0"
            with open(os.path.join(output_folder, new_index_name), "w", encoding="utf-8") as f:
                f.write(version_content_processed)

            # Write the matching note.info file for this historical version.
            version_info_filename = f"note.info_00{timestamp}_00000000-0000-0000-0000-000000000000_web_4.0"
            version_note_info_data = {
                "versionInfo": {
                    "version": int(timestamp),  # converted to an integer
                    "editorGuid": "00000000-0000-0000-0000-000000000000",
                    "clientType": "web",
                    "clientVersion": "4.0"
                },
                "resources": []
            }
            with open(os.path.join(output_folder, version_info_filename), "w", encoding="utf-8") as f:
                json.dump(version_note_info_data, f, ensure_ascii=False)

    # -----------------------
    # Register the note in MySQL
    # -----------------------
    # Fixed values
    kb_guid_str = "00000000000000000000000000000000"  # KB_GUID with the dashes removed (all zeros here)
    note_guid_bin = bytes.fromhex(note_guid)  # generated GUID converted to binary
    kb_guid_bin   = bytes.fromhex(kb_guid_str)
    version_val   = -1
    # NOTE(review): fixed value, not computed from the files written above --
    # confirm that the real data size is not needed.
    document_data_size = 100
    document_category  = "/小米便签导入/"
    document_owner     = "admin@wiz.cn"
    document_protect   = 0
    document_read_count = 0
    document_attachment_count = 0
    document_type = ""
    # NOTE(review): presumably the Unix epoch expressed in UTC+8 -- confirm.
    dt_param_modified = "1970-01-01 08:00:00"
    document_icon_index = -1

    # Strip HTML tags and line breaks to build the abstract and the body text.
    document_abstract_text = remove_html(snippet)
    body_text = remove_html(content)

    # Run the INSERT; the values below are positional and must stay in the
    # same order as the column list.
    with db_conn.cursor() as cursor:
        sql = """
        INSERT INTO wiz_document 
        (DOCUMENT_GUID, KB_GUID, VERSION, DT_DATA_MODIFIED, DT_INFO_MODIFIED, DOCUMENT_DATA_SIZE, DOCUMENT_TITLE, DOCUMENT_CATEGORY, DOCUMENT_OWNER, DT_CREATED, DT_ACCESSED, DOCUMENT_ABSTRACT_TEXT, BODY_TEXT, DOCUMENT_PROTECT, DOCUMENT_READ_COUNT, DOCUMENT_ATTACHMENT_COUNT, DOCUMENT_TYPE, DT_PARAM_MODIFIED, DOCUMENT_ICON_INDEX)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(sql, (
            note_guid_bin,
            kb_guid_bin,
            version_val,
            dt_modify,  # DT_DATA_MODIFIED
            dt_modify,  # DT_INFO_MODIFIED
            document_data_size,
            subject,
            document_category,
            document_owner,
            dt_create,  # DT_CREATED
            dt_modify,  # DT_ACCESSED
            document_abstract_text[:254],
            body_text,
            document_protect,
            document_read_count,
            document_attachment_count,
            document_type,
            dt_param_modified,
            document_icon_index
        ))
        db_conn.commit()

    # -----------------------
    # Write the success marker file
    # -----------------------
    with open(os.path.join(output_folder, "success"), "w", encoding="utf-8") as f:
        f.write("success")

def main():
    """Import every crawled note folder below ``notes`` into WizNote.

    Each sub-folder of ``notes`` is handed to ``process_note``; a failure on
    one note is reported and the remaining notes are still processed. The
    database connection is closed on every exit path.
    """
    # Root folders of the crawled notes (input) and the converted notes (output).
    notes_base = "notes"
    output_base = "output"
    os.makedirs(output_base, exist_ok=True)

    # Open the MySQL connection; adjust the parameters to your environment.
    db_conn = pymysql.connect(
        host="172.17.0.2",
        user="root",
        port=3306,
        password="password",
        database="wizksent",
        charset="utf8mb4"
    )

    try:
        # Every sub-folder of the notes folder holds one note.
        for note_dir in os.listdir(notes_base):
            note_path = os.path.join(notes_base, note_dir)
            if not os.path.isdir(note_path):
                continue
            try:
                process_note(note_path, output_base, db_conn)
            except Exception as e:
                # Top-level boundary: report the failure and carry on with
                # the remaining notes.
                print(f"处理笔记 {note_dir} 时出错:{e}")
            else:
                print(f"已处理笔记:{note_dir}")
    finally:
        # Close the connection even when listing the notes folder fails.
        db_conn.close()


if __name__ == "__main__":
    main()
转载原创文章请注明,转载自: 斐斐のBlog » 记一次小米笔记导入为知笔记
目前还没有评论,快来抢沙发吧~