google excel导入原理分析

在 ERPNext 上，只要贴上 Google 在线表格（Google Sheets）的链接地址，就能把表格中的数据导入，这是非常不错的功能。

今天针对frappe到底是如何处理的，做了一下代码分析，如下是抽取的代码逻辑：

分析：

这段代码演示了如何通过Python从Google Sheets中获取CSV数据，这是Google Sheets提供的一种导出数据的方式。让我解释一下其中的关键逻辑：

`validate_google_sheets_url(url)` 函数用于验证Google Sheets的URL是否有效。它使用Python的 `urllib.parse` 模块来解析URL，检查URL的协议是否为 "https"，域名是否是 "docs.google.com"，以及路径中是否包含 "/spreadsheets/" 这一部分。如果URL不满足这些条件，它会引发一个错误，表示URL无效。
`get_csv_content_from_google_sheets(url)` 函数用于获取Google Sheets的CSV数据。首先，它调用上面的验证函数来确保URL有效。然后，它从URL中提取 `gid`，这是工作表的标识符，默认值是0。接下来，它去掉URL的 "/edit" 部分，并在末尾添加 "/export?format=csv&gid={gid}"，以构建一个新的URL用于获取CSV数据。
使用 `requests` 库发送 HTTP GET 请求到新构建的URL，同时添加一个 `Accept` 头部，表示请求CSV格式的数据。
如果请求成功（`response.ok` 为真，例如状态码为200），它会先检查响应内容是否以 "</html>" 结尾：如果是，说明返回的是网页而不是CSV，可能是由于URL无效或无访问权限，所以它会引发一个错误；否则返回响应的内容（CSV数据的二进制形式）。如果状态码是400，它会引发另一个错误，表示URL必须以 "gid={number}" 结尾，需要检查URL是否正确。如果是其他错误状态码，它会通过 `response.raise_for_status()` 引发异常。
最后，示例脚本将获取到的CSV二进制数据按UTF-8解码为文本，并打印出来，以便进一步处理。


这段代码充分利用了Google Sheets的公开导出功能，该方法通过构建一个新的URL，去掉原始URL的 "/edit" 部分并在末尾添加 "/export?format=csv&gid={gid}"，从而获取 Google Sheets 数据的CSV格式。

代码：

import requests

def validate_google_sheets_url(url):
    """Reject any URL that does not look like a Google Sheets document link.

    A URL is accepted only when its scheme is ``https``, its host is exactly
    ``docs.google.com`` and its path contains ``/spreadsheets/``.

    Args:
        url: The URL string to check.

    Returns:
        None when the URL is accepted. Otherwise the problem is reported
        through ``frappe.throw``.
    """
    from urllib.parse import urlparse

    parsed = urlparse(url)
    looks_like_sheet = (
        parsed.scheme == "https"
        and parsed.netloc == "docs.google.com"
        and "/spreadsheets/" in parsed.path
    )
    if looks_like_sheet:
        return

    frappe.throw(
        _('"{0}" is not a valid Google Sheets URL').format(url),
        title=_("Invalid URL"),
    )
        
def get_csv_content_from_google_sheets(url, timeout=30):
    """Download one worksheet of a Google Sheets document as CSV bytes.

    The browser URL (``.../spreadsheets/d/{sheet_id}/edit#gid={gid}``) is
    rewritten to ``.../spreadsheets/d/{sheet_id}/export?format=csv&gid={gid}``
    and fetched with an HTTP GET.

    Args:
        url: Google Sheets URL as copied from the browser address bar.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the caller indefinitely.

    Returns:
        The raw response body (bytes) when the request succeeds and the body
        does not look like an HTML page.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: For error statuses other than 400
            (raised by ``response.raise_for_status()``).

    Invalid URLs, HTML responses and HTTP 400 are reported through
    ``frappe.throw``.
    """
    # https://docs.google.com/spreadsheets/d/{{sheetid}}/edit#gid={gid}
    validate_google_sheets_url(url)

    # Everything after the last "gid=" is used as the worksheet id; when the
    # URL carries none, fall back to 0 (the first sheet).
    if "gid=" in url:
        gid = url.rsplit("gid=", 1)[1]
    else:
        gid = 0

    # Drop the last "/edit" and whatever follows it, then append the export
    # endpoint for the selected worksheet.
    url = url.rsplit("/edit", 1)[0]
    url = url + f"/export?format=csv&gid={gid}"

    headers = {"Accept": "text/csv"}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.ok:
        # if it returns html, it couldn't find the CSV content
        # because of invalid url or no access
        if response.text.strip().endswith("</html>"):
            frappe.throw(
                _("Google Sheets URL is invalid or not publicly accessible."), title=_("Invalid URL")
            )
        return response.content
    elif response.status_code == 400:
        frappe.throw(
            _(
                'Google Sheets URL must end with "gid={number}". Copy and paste the URL from the browser address bar and try again.'
            ),
            title=_("Incorrect URL"),
        )
    else:
        response.raise_for_status()
        
# Sample sharing link of the spreadsheet to import.
url = "https://docs.google.com/spreadsheets/d/1FpbcnWxxxxr49oCU/edit?usp=sharing"

# The helper returns raw bytes; fetch first, then decode them as UTF-8 text.
csv_data = get_csv_content_from_google_sheets(url)
csv_data = csv_data.decode("utf-8")

# Print the decoded CSV text.
print(csv_data)

延展：

腾讯文档能否实现类似的逻辑呢？搜索了一番，目前是需要先打开腾讯文档，读取2个参数local_pad_id 和 cookie_value 的值。

把这两个参数的值填入（维护到）下面的代码之后，就可以下载到这个文档的内容。

第2种方法是通过应用 API 对接来读取文档内容，这种方式是可行的。

像 Google Sheets 那样只贴一个链接、免 API 读取的方式，目前在腾讯文档上似乎还不可行；期待看到此文的大神去试试，是否有更智能的免 API 读取腾讯文档内容的方法。

代码：

# -*- coding: UTF-8 -*-
"""
@Project :small-tools 
@File    :tengxun.py
@Author  :silen
@Time    :2022/5/26 15:42
@Description : 
"""
import json
import os
import re
import time
from datetime import datetime
from time import sleep
import click
import pandas as pd
import requests
from bs4 import BeautifulSoup


class TengXunDocument():
    """Export a Tencent Docs spreadsheet and download the exported file.

    Typical call order: ``get_now_user_index`` -> ``export_excel_task`` ->
    ``download_excel`` or ``download_csv``.

    NOTE(review): every request is sent with ``verify=False``, i.e. TLS
    certificate verification is disabled while the session cookie is being
    transmitted. This is kept for backward compatibility; confirm whether it
    is really required in your environment.
    """

    # Seconds a single HTTP request may take before requests gives up.
    REQUEST_TIMEOUT = 30
    # Total seconds to wait for the export task to report 100 % progress.
    POLL_TIMEOUT = 30
    # Seconds to pause between two progress queries.
    POLL_INTERVAL = 1

    def __init__(self, document_url, local_pad_id, cookie_value):
        """Store the document address, its pad id and the request headers.

        :param document_url: address of the spreadsheet document
        :param local_pad_id: per-document id; has to be collected manually
        :param cookie_value: value sent as the ``Cookie`` request header
        """
        # Address of the spreadsheet document.
        self.document_url = document_url
        # Every Tencent document has its own value; it is collected manually.
        self.localPadId = local_pad_id
        self.headers = {
            'content-type': 'application/x-www-form-urlencoded',
            'Cookie': cookie_value
        }

    def get_now_user_index(self):
        """
        Fetch the document page and extract the current user's index, which
        is needed to create the export task.

        The page is searched for ``window.global_multi_user=...;`` and the
        captured text is parsed as JSON.

        :return: ``user_dict['nowUserIndex']``, e.g.
            nowUserIndex = '4883730fe8b94fbdb94da26a9a63b688'.
            When the marker is not found, the literal message
            'cookie过期,请重新输入' ("cookie expired, please enter it again")
            is returned instead of raising. Callers cannot tell this apart
            from a real index other than by comparing against that text.
        """
        response_body = requests.get(url=self.document_url, headers=self.headers,
                                     verify=False, timeout=self.REQUEST_TIMEOUT)
        parser = BeautifulSoup(response_body.content, 'html.parser')
        global_multi_user_list = re.findall(re.compile('window.global_multi_user=(.*?);'), str(parser))
        if global_multi_user_list:
            user_dict = json.loads(global_multi_user_list[0])
            print(user_dict)
            return user_dict['nowUserIndex']
        return 'cookie过期,请重新输入'

    def export_excel_task(self, export_excel_url):
        """
        Create the export task whose progress is queried afterwards.

        :param export_excel_url: export endpoint URL to POST to
        :return: the ``operationId`` field of the JSON response
        """
        body = {
            'docId': self.localPadId, 'version': '2'
        }

        res = requests.post(url=export_excel_url, headers=self.headers, data=body,
                            verify=False, timeout=self.REQUEST_TIMEOUT)
        operation_id = res.json()['operationId']
        return operation_id

    def _wait_for_file_url(self, check_progress_url):
        """Poll the progress endpoint; return the file URL, or '' on timeout."""
        started = time.monotonic()
        while True:
            res = requests.get(url=check_progress_url, headers=self.headers,
                               verify=False, timeout=self.REQUEST_TIMEOUT)
            # Parse the body once and read both fields from the same payload.
            payload = res.json()
            if payload['progress'] == 100:
                return payload['file_url']
            if time.monotonic() - started > self.POLL_TIMEOUT:
                print("数据准备超时,请排查")
                return ''
            # Pause between queries instead of re-requesting immediately.
            time.sleep(self.POLL_INTERVAL)

    def _download(self, check_progress_url, file_name, content_type, kind):
        """Wait for the export, then save the exported file to ``file_name``."""
        file_url = self._wait_for_file_url(check_progress_url)
        if not file_url:
            print(f"下载文件地址获取失败, 下载{kind}文件不成功")
            return
        # Use a per-request copy: overwriting self.headers would change the
        # content-type of every later request made by this instance.
        download_headers = {**self.headers, 'content-type': content_type}
        res = requests.get(url=file_url, headers=download_headers,
                           verify=False, timeout=self.REQUEST_TIMEOUT)
        with open(file_name, 'wb') as f:
            f.write(res.content)
        print('下载成功,文件名: ' + file_name)

    def download_excel(self, check_progress_url, file_name):
        """
        Download the exported file and save it as an Excel file.

        :param check_progress_url: URL that reports the export progress
        :param file_name: path of the local file to write
        :return: None
        """
        self._download(check_progress_url, file_name, 'application/octet-stream', 'excel')

    def download_csv(self, check_progress_url, file_name):
        """
        Download the exported file and save it under a CSV file name.

        NOTE(review): only the request's content-type header differs from
        ``download_excel``; whether the server then really delivers CSV
        content is not visible here, so confirm the saved file's format.

        :param check_progress_url: URL that reports the export progress
        :param file_name: path of the local file to write
        :return: None
        """
        self._download(check_progress_url, file_name, 'text/csv', 'CSV')


if __name__ == '__main__':
    # Inputs that have to be collected by hand:
    #   document_url - address of the spreadsheet document
    #   local_pad_id - every Tencent document has its own value
    #   cookie_value - cookie taken from a captured request after opening the document
    document_url = 'https://docs.qq.com/sheet/DV2dyxxxxJwY0pa'
    local_pad_id = '300000000$WgxxxxxJRpcJZ'
    cookie_value = 'xxxx'

    tx = TengXunDocument(document_url, local_pad_id, cookie_value)
    now_user_index = tx.get_now_user_index()

    # Create the export task and keep its operation id.
    export_excel_url = f'https://docs.qq.com/v1/export/export_office?u={now_user_index}'
    operation_id = tx.export_excel_task(export_excel_url)

    # The progress of that task is queried through this URL.
    check_progress_url = f'https://docs.qq.com/v1/export/query_progress?u={now_user_index}&operationId={operation_id}'

    # Name the output file after the current local time.
    current_datetime = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

    # To save an Excel workbook instead of CSV, use:
    #     file_name = f'{current_datetime}.xlsx'
    #     tx.download_excel(check_progress_url, file_name)
    file_name = current_datetime + '.csv'
    tx.download_csv(check_progress_url, file_name)

google excel导入原理分析

分析：​

代码：​

延展：​

分析：

代码：

延展：